
Commit 39be5f0

fullo and claude committed
Ensure all 3 examples show progressive SCI improvement on every iteration
01-string-processing:
- Iteration 1 now more wasteful: str_replace on 2MB string + substr_count to parse own HTML output. Dataset increased to 20,000 records.
- Iteration 2 left with second loop for stats (intentional gap).
- Results: 0.182 → 0.154 → 0.104 mgCO2eq (-43%)

02-database-simulation:
- Iteration 2 now uses array_filter for join (O(n) per lookup = O(n²) total on flat arrays) instead of O(1) hash-map.
- Iteration 3 unchanged (hash-map + inline aggregation).
- Results: 0.464 → 0.211 → 0.007 mgCO2eq (-98%)

03-json-api: unchanged (already progressive).
- Results: 0.453 → 0.212 → 0.144 mgCO2eq (-68%)

All sparklines now show a clear downward trend: █▅▁, █▄▁, █▂▁

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e4cf6e9 commit 39be5f0

7 files changed

Lines changed: 113 additions & 83 deletions

examples/01-string-processing.php

Lines changed: 29 additions & 16 deletions
@@ -8,9 +8,9 @@
  * Simulates building an HTML report from 5,000 records.
  * Run 3 times with increasing iteration number to see SCI drop:
  *
- *   php 01-string-processing.php 1   ← naive: .= in loop
- *   php 01-string-processing.php 2   ← fix: array + implode
- *   php 01-string-processing.php 3   ← refined: sprintf + single-pass stats
+ *   php 01-string-processing.php 1   ← naive: .= in loop + str_replace on 2MB + substr_count
+ *   php 01-string-processing.php 2   ← fix: array + implode (but still 2 loops)
+ *   php 01-string-processing.php 3   ← refined: sprintf + single-pass stats in one loop
  *
  * @author fullo <https://github.com/fullo>
  * @license MIT
@@ -20,10 +20,11 @@
 $iteration = (int) ($argv[1] ?? 1);
 echo "=== String Processing — iteration {$iteration}/3 ===\n";
 
-// ── Generate 5,000 user records (same seed for all iterations) ──
+// ── Generate 20,000 user records (same seed for all iterations) ──
+// Larger dataset makes string handling differences measurable.
 mt_srand(42);
 $users = [];
-for ($i = 0; $i < 5000; $i++) {
+for ($i = 0; $i < 20000; $i++) {
     $users[] = [
         'id' => $i + 1,
         'name' => 'User ' . str_pad((string) ($i + 1), 4, '0', STR_PAD_LEFT),
@@ -42,14 +43,18 @@
 $footer = '</tbody></table>';
 
 match ($iteration) {
-    // ── Iteration 1: Naive — string concatenation in a loop ──
-    // Each .= copies the entire $html string (O(n²) memory operations).
+    // ── Iteration 1: Maximally naive — concatenation + redundant processing ──
+    // 7 separate .= per row (each copies the entire growing string).
+    // After building the HTML, runs str_replace on the full string to
+    // "fix" the CSS class names — a common anti-pattern in legacy code.
+    // Then re-counts everything in separate loops.
     1 => (function () use ($users, $header, $footer): string {
         $html = $header;
 
         foreach ($users as $user) {
             $class = $user['active'] ? '' : ' class="inactive"';
             $status = $user['active'] ? 'Active' : 'Inactive';
+            // 7 separate concatenations per row — each copies entire $html
             $html .= '<tr' . $class . '>';
             $html .= '<td>' . $user['id'] . '</td>';
             $html .= '<td>' . htmlspecialchars($user['name']) . '</td>';
@@ -61,25 +66,33 @@
 
         $html .= $footer;
 
-        // Summary: second loop over all users
-        $active = 0;
+        // Wasteful: "fix" class names via str_replace on the entire ~2MB string
+        $html = str_replace('class="inactive"', 'class="user-inactive"', $html);
+        $html = str_replace('class="user-inactive"', 'class="inactive"', $html);
+
+        // Wasteful: count active users by parsing the HTML we just built
+        $active = substr_count($html, '<td>Active</td>');
+        $inactive = substr_count($html, '<td>Inactive</td>');
+
+        // Wasteful: compute total score in a separate loop
         $total = 0.0;
         foreach ($users as $user) {
-            if ($user['active']) {
-                $active++;
-            }
             $total += $user['score'];
         }
-        $html .= '<p>Active: ' . $active . '/' . count($users) . '</p>';
+
+        $html .= '<p>Active: ' . $active . '/' . ($active + $inactive) . '</p>';
         $html .= '<p>Avg score: ' . number_format($total / count($users), 2) . '</p>';
         $html .= '</body></html>';
 
         echo 'Output: ' . strlen($html) . " bytes | Active: {$active}\n";
         return $html;
     })(),
 
-    // ── Iteration 2: Fix — array + implode, single allocation ──
-    // Each $parts[] = '...' is O(1). implode() does one allocation at the end.
+    // ── Iteration 2: array + implode, but still two loops ──
+    // Fixed: no more .= concatenation. Uses array + implode.
+    // Remaining issue: summary stats computed in a separate second loop
+    // over all 20,000 records. Also uses string concatenation for each row
+    // instead of sprintf.
     2 => (function () use ($users, $header, $footer): string {
         $parts = [$header];
 
@@ -97,7 +110,7 @@
 
         $parts[] = $footer;
 
-        // Summary: still a second loop
+        // Still a second loop for summary — iterates 20,000 records again
         $active = 0;
         $total = 0.0;
         foreach ($users as $user) {
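The three row-building strategies that the iterations above compare can be sketched standalone. This is an illustrative micro-example, not the example file itself; the variable names and the 1,000-row dataset are assumptions for demonstration:

```php
<?php
// Contrast the three strategies on the same data. All produce identical HTML;
// they differ in how many times the output buffer is copied or reallocated.

$rows = range(1, 1000);

// Iteration-1 style: .= on a growing string (each append may copy the buffer).
$concat = '';
foreach ($rows as $id) {
    $concat .= '<tr><td>' . $id . '</td></tr>';
}

// Iteration-2 style: collect parts in an array, join once at the end.
$parts = [];
foreach ($rows as $id) {
    $parts[] = '<tr><td>' . $id . '</td></tr>';
}
$imploded = implode('', $parts);

// Iteration-3 style: sprintf per row, with stats gathered in the same pass
// (here just a row count, standing in for the example's active/score stats).
$parts = [];
$count = 0;
foreach ($rows as $id) {
    $parts[] = sprintf('<tr><td>%d</td></tr>', $id);
    $count++;
}
$single = implode('', $parts);

var_dump($concat === $imploded && $imploded === $single); // bool(true)
```

The outputs are byte-identical; only the allocation pattern changes, which is exactly what the SCI measurement is meant to expose.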

examples/02-database-simulation.php

Lines changed: 26 additions & 9 deletions
@@ -9,8 +9,8 @@
  * Uses usleep() to simulate real database query latency (50μs per query).
  *
  *   php 02-database-simulation.php 1   ← naive: N+1 queries (1,001 total)
- *   php 02-database-simulation.php 2   ← fix: 3 batch queries + hash join
- *   php 02-database-simulation.php 3   ← refined: batch + inline aggregation
+ *   php 02-database-simulation.php 2   ← fix: 3 batch queries, but linear scan join O(n²)
+ *   php 02-database-simulation.php 3   ← refined: 3 batch + hash-map O(1) + inline aggregation
  *
  * @author fullo <https://github.com/fullo>
  * @license MIT
@@ -101,18 +101,34 @@ function dbQuery(string $description): void
         echo "Orders: " . count($results) . " | Queries: {$queryCount} | Revenue: $" . number_format($revenue, 2) . "\n";
     })(),
 
-    // ── Iteration 2: 3 batch queries ──
-    // Fetch all data upfront, join in PHP with O(1) hash lookups.
+    // ── Iteration 2: 3 batch queries, but linear scan for join ──
+    // Good: only 3 queries instead of 1,001.
+    // Bad: customer lookup uses array_filter (O(n) per order = O(n²) total)
+    // instead of indexed array access. Also builds a flat customer list
+    // first, losing the indexed structure.
     2 => (function () use ($orders, $customers, $orderItems, &$queryCount): void {
         dbQuery('SELECT * FROM orders');
         dbQuery('SELECT * FROM customers WHERE id IN (...)');
         dbQuery('SELECT * FROM order_items WHERE order_id IN (...)');
         $queryCount = 3;
 
+        // Simulate receiving batch results as flat arrays (no index)
+        $customerList = array_values($customers);
+        $itemsByOrder = [];
+        foreach ($orderItems as $orderId => $items) {
+            foreach ($items as $item) {
+                $itemsByOrder[] = $item;
+            }
+        }
+
         $results = [];
         foreach ($orders as $order) {
-            $customer = $customers[$order['customer_id']];
-            $items = $orderItems[$order['id']];
+            // O(n) linear scan to find customer — array_filter on 200 customers × 500 orders
+            $matches = array_filter($customerList, fn ($c) => $c['id'] === $order['customer_id']);
+            $customer = reset($matches);
+
+            // O(n) linear scan for order items
+            $items = array_filter($itemsByOrder, fn ($i) => $i['order_id'] === $order['id']);
 
             $total = 0.0;
             foreach ($items as $item) {
@@ -131,15 +147,16 @@ function dbQuery(string $description): void
         echo "Orders: " . count($results) . " | Queries: {$queryCount} | Revenue: $" . number_format($revenue, 2) . "\n";
     })(),
 
-    // ── Iteration 3: batch + inline aggregation ──
-    // Same 3 queries, but revenue computed inline — no second loop,
-    // no intermediate $results array (saves memory + CPU).
+    // ── Iteration 3: batch queries + hash-map join + inline aggregation ──
+    // 3 queries + O(1) hash-map lookups + revenue computed inline.
+    // No intermediate $results array, no second summary loop.
     3 => (function () use ($orders, $customers, $orderItems, &$queryCount): void {
         dbQuery('SELECT * FROM orders');
         dbQuery('SELECT * FROM customers WHERE id IN (...)');
         dbQuery('SELECT * FROM order_items WHERE order_id IN (...)');
         $queryCount = 3;
 
+        // $customers and $orderItems are already indexed by ID — O(1) lookup
        $revenue = 0.0;
         $count = 0;
 
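The linear-scan join introduced in iteration 2 versus the indexed join of iteration 3 can be contrasted in a few lines. The data shapes and names below are assumptions for illustration, not copied from the example file:

```php
<?php
// Two ways to join orders to customers in PHP.

$customers = [
    ['id' => 1, 'name' => 'Ada'],
    ['id' => 2, 'name' => 'Grace'],
];
$orders = [
    ['id' => 10, 'customer_id' => 2],
    ['id' => 11, 'customer_id' => 1],
];

// Iteration-2 style: array_filter scans every customer for every order — O(n²).
$slow = [];
foreach ($orders as $order) {
    $matches = array_filter($customers, fn ($c) => $c['id'] === $order['customer_id']);
    $slow[$order['id']] = reset($matches)['name'];
}

// Iteration-3 style: build an index keyed by id once, then each lookup is O(1).
$byId = array_column($customers, null, 'id');
$fast = [];
foreach ($orders as $order) {
    $fast[$order['id']] = $byId[$order['customer_id']]['name'];
}

var_dump($slow === $fast); // bool(true) — same result, very different cost curve
```

With 200 customers and 500 orders, the filter version performs ~100,000 comparisons where the indexed version performs 500 hash lookups, which is why iteration 3's SCI collapses to 0.007 mgCO2eq.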

examples/README.md

Lines changed: 13 additions & 13 deletions
@@ -32,35 +32,35 @@ The `run-all.sh` script:
 
 ## Examples
 
-### 01 — String Processing
+### 01 — String Processing (20,000 records)
 
 | Iteration | Approach | SCI |
 |-----------|----------|-----|
-| 1 (naive) | `.=` concatenation in loop — O(n²) memory copies | 0.035 mgCO2eq |
-| 2 (optimized) | Array of parts + `implode()` — O(n) allocation | 0.030 mgCO2eq |
-| 3 (refined) | `sprintf` per row + single-pass stats — no second loop | 0.026 mgCO2eq |
+| 1 (naive) | `.=` in loop (7 per row) + `str_replace` on 2MB + `substr_count` | 0.182 mgCO2eq |
+| 2 (optimized) | Array + `implode()` (but still 2 loops for summary) | 0.154 mgCO2eq |
+| 3 (refined) | `sprintf` per row + single-pass stats in one loop | 0.104 mgCO2eq |
 
-**Total reduction: ~30%**
+**Total reduction: ~43%**
 
-### 02 — Database Simulation (N+1 Queries)
+### 02 — Database Simulation (N+1 → Batch)
 
 | Iteration | Approach | SCI |
 |-----------|----------|-----|
-| 1 (naive) | N+1 queries: 1,001 total (50μs each) | 0.468 mgCO2eq |
-| 2 (optimized) | 3 batch queries + hash-map join | 0.008 mgCO2eq |
-| 3 (refined) | Batch + inline aggregation, no intermediate array | 0.007 mgCO2eq |
+| 1 (naive) | N+1 queries: 1,001 total (50μs each) | 0.464 mgCO2eq |
+| 2 (optimized) | 3 batch queries, but linear scan O(n²) for join | 0.211 mgCO2eq |
+| 3 (refined) | 3 batch + hash-map O(1) join + inline aggregation | 0.007 mgCO2eq |
 
 **Total reduction: ~98%**
 
 ### 03 — JSON API Processing (10,000 events)
 
 | Iteration | Approach | SCI |
 |-----------|----------|-----|
-| 1 (naive) | Double decode, sort, 6 `array_filter` passes, per-record `json_encode` | 0.506 mgCO2eq |
-| 2 (optimized) | Single-pass aggregation + one `json_encode` | 0.219 mgCO2eq |
-| 3 (refined) | Regex extraction from raw JSON — no full decode at all | 0.151 mgCO2eq |
+| 1 (naive) | Double decode, sort, 6 `array_filter` passes, per-record `json_encode` | 0.453 mgCO2eq |
+| 2 (optimized) | Single-pass aggregation + one `json_encode` | 0.212 mgCO2eq |
+| 3 (refined) | Regex extraction from raw JSON — no full decode at all | 0.144 mgCO2eq |
 
-**Total reduction: ~70%**
+**Total reduction: ~68%**
 
 ## Generated Reports
 
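The "regex extraction from raw JSON" technique named in example 03's third iteration can be sketched as follows. The field name (`type`) and payload shape are assumptions for illustration; the actual example file may extract different fields:

```php
<?php
// Pull a single field out of a raw JSON string without materializing
// every record via json_decode() — the idea behind example 03, iteration 3.

$raw = '[{"type":"click","ms":12},{"type":"view","ms":7},{"type":"click","ms":3}]';

// Full-decode approach: every record becomes a PHP array first.
$decoded = array_count_values(array_column(json_decode($raw, true), 'type'));

// Regex approach: scan the raw string for just the one field.
preg_match_all('/"type":"([^"]+)"/', $raw, $m);
$extracted = array_count_values($m[1]);

var_dump($decoded === $extracted); // bool(true)
```

This only holds for simple, trusted payloads (no escaped quotes or nested objects reusing the key), which is why it belongs in a "refined" iteration rather than general-purpose code.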
