|
- * Simulates a backend that receives a 3,000-event JSON payload,
+ * Simulates a backend that receives a 10,000-event JSON payload,
  * filters, aggregates, and produces a summary response.
  *
- * php 03-json-api.php 1 ← naive: multiple filter passes, repeated encode/decode
+ * php 03-json-api.php 1 ← naive: double decode, sort, 6 filter passes, per-record encode
  * php 03-json-api.php 2 ← fix: single-pass aggregation, one encode
- * php 03-json-api.php 3 ← refined: isset() lookups + pre-allocated counters
+ * php 03-json-api.php 3 ← refined: regex extraction from raw JSON, no full decode
  *
  * @author fullo <https://github.com/fullo>
  * @license MIT
|
  $iteration = (int) ($argv[1] ?? 1);
  echo "=== JSON API Processing — iteration {$iteration}/3 ===\n";

- // ── Generate a 3,000-event JSON payload (same seed) ──
+ // ── Generate a 10,000-event JSON payload (same seed) ──
+ // Larger payload makes efficiency differences measurable.
  mt_srand(42);

  $records = [];
- for ($i = 0; $i < 3000; $i++) {
+ for ($i = 0; $i < 10000; $i++) {
      $records[] = [
          'id' => $i + 1,
          'timestamp' => date('c', strtotime("-{$i} minutes")),
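(The diff elides the rest of the record literal, old lines 31-42. Judging from the fields consumed later in the file ("type", "url", "duration_ms", "country") and the metadata/referrers mentioned in the iteration 3 comment, the elided fields plausibly continue along the lines of the hypothetical sketch below; the actual committed values and weights are not shown in this diff.)

            // Hypothetical reconstruction of the elided fields; the names come
            // from the iteration 3 regexes, everything else is a guess.
            'type'        => ['pageview', 'click', 'conversion', 'error'][mt_rand(0, 3)],
            'url'         => '/page/' . mt_rand(1, 50),
            'duration_ms' => mt_rand(10, 5000),
            'country'     => ['US', 'DE', 'FR', 'IT', 'GB', 'JP'][mt_rand(0, 5)],
        ];
    }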
|
  $payload = json_encode(['events' => $records], JSON_THROW_ON_ERROR);

  match ($iteration) {
-     // ── Iteration 1: multiple filter passes + repeated encode/decode ──
-     // array_filter creates a new array copy each time.
-     // json_encode per record is O(n) × payload_size.
+     // ── Iteration 1: maximally wasteful — decode/encode/sort/copy everywhere ──
+     // Decodes the payload twice, sorts a full copy for no reason,
+     // applies 6 array_filter passes (each copying the array),
+     // re-encodes every record individually to compute "weight",
+     // and verifies the response with yet another decode round-trip.
      1 => (function () use ($payload): void {
-         $decoded = json_decode($payload, true, 512, JSON_THROW_ON_ERROR);
+         // Wasteful: decode twice (simulates "validate then process" anti-pattern)
+         $validated = json_decode($payload, true, 512, JSON_THROW_ON_ERROR);
+         $validatedJson = json_encode($validated, JSON_THROW_ON_ERROR);
+         $decoded = json_decode($validatedJson, true, 512, JSON_THROW_ON_ERROR);
          $events = $decoded['events'];

-         // 4 separate filter passes (each scans all 3,000 records)
-         $pageviews = array_filter($events, fn ($e) => $e['type'] === 'pageview');
-         $clicks = array_filter($events, fn ($e) => $e['type'] === 'click');
-         $conversions = array_filter($events, fn ($e) => $e['type'] === 'conversion');
-         $errors = array_filter($events, fn ($e) => $e['type'] === 'error');
+         // Wasteful: sort the entire array by timestamp before filtering
+         // (common anti-pattern: sort everything, then filter to a subset)
+         $sorted = $events;
+         usort($sorted, fn ($a, $b) => strcmp($a['timestamp'], $b['timestamp']));
+
+         // 4 separate filter passes on the sorted copy (each creates a new array)
+         $pageviews = array_filter($sorted, fn ($e) => $e['type'] === 'pageview');
+         $clicks = array_filter($sorted, fn ($e) => $e['type'] === 'click');
+         $conversions = array_filter($sorted, fn ($e) => $e['type'] === 'conversion');
+         $errors = array_filter($sorted, fn ($e) => $e['type'] === 'error');

          // 2 more filter passes for country groups
-         $usEvents = array_filter($events, fn ($e) => $e['country'] === 'US');
-         $euEvents = array_filter($events, fn ($e) => in_array($e['country'], ['DE', 'FR', 'IT', 'GB']));
+         $usEvents = array_filter($sorted, fn ($e) => $e['country'] === 'US');
+         $euEvents = array_filter($sorted, fn ($e) => in_array($e['country'], ['DE', 'FR', 'IT', 'GB']));

-         // Re-encode each record to compute "weight"
+         // Wasteful: re-encode each record individually to compute "weight"
          $totalWeight = 0;
          foreach ($events as $event) {
              $totalWeight += strlen(json_encode($event, JSON_THROW_ON_ERROR));
          }

-         // Build response from separately encoded pieces
+         // Wasteful: build response by encoding sub-arrays separately
          $response = '{"summary":' . json_encode([
              'total' => count($events),
              'pageviews' => count($pageviews),

              'us_events' => count($usEvents),
              'eu_events' => count($euEvents),
          ]) . ',"top_pages":' . json_encode(
-             array_count_values(array_column($events, 'url'))
+             array_count_values(array_column($sorted, 'url'))
          ) . ',"avg_duration":' . json_encode(
-             array_sum(array_column($events, 'duration_ms')) / count($events)
+             array_sum(array_column($sorted, 'duration_ms')) / count($sorted)
          ) . '}';

-         // Wasteful verification round-trip
+         // Wasteful: decode the response to "verify" it
          json_decode($response, true, 512, JSON_THROW_ON_ERROR);

          echo "Events: " . count($events) . " | Response: " . strlen($response) . " bytes\n";
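The diff then skips iteration 2 entirely (its body sits in the elided old lines 88-129; only its closing echo appears below). Going solely by the docblock's description, "single-pass aggregation, one encode", a minimal hypothetical sketch of that arm could look like the following; the committed code is not shown here and will differ in its details:

    // Hypothetical sketch of iteration 2 ("single-pass aggregation, one encode").
    // The committed body (old lines 88-129) is not shown in this diff.
    function summarizeSinglePass(string $payload): string
    {
        $events = json_decode($payload, true, 512, JSON_THROW_ON_ERROR)['events'];

        $typeCounts = ['pageview' => 0, 'click' => 0, 'conversion' => 0, 'error' => 0];
        $usCount = 0;
        $euCount = 0;
        $totalDuration = 0;
        $pageCounts = [];

        // Every aggregate is updated in one pass: no sorted copy, no filter copies.
        foreach ($events as $e) {
            $typeCounts[$e['type']]++;
            if ($e['country'] === 'US') {
                $usCount++;
            } elseif (in_array($e['country'], ['DE', 'FR', 'IT', 'GB'], true)) {
                $euCount++;
            }
            $totalDuration += $e['duration_ms'];
            $pageCounts[$e['url']] = ($pageCounts[$e['url']] ?? 0) + 1;
        }

        // One json_encode for the whole response instead of concatenated pieces.
        return json_encode([
            'summary' => ['total' => count($events)] + $typeCounts
                + ['us_events' => $usCount, 'eu_events' => $euCount],
            'top_pages' => $pageCounts,
            'avg_duration' => $totalDuration / count($events),
        ], JSON_THROW_ON_ERROR);
    }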
|
          echo "Events: " . count($events) . " | Response: " . strlen($response) . " bytes\n";
      })(),
|
-     // ── Iteration 3: isset() lookups + pre-allocated counters ──
-     // isset() is faster than in_array() for country check.
-     // No closure overhead for array_filter. No array_column copies.
+     // ── Iteration 3: regex-extract and aggregate from raw JSON, no full decode ──
+     // Instead of json_decode() into a massive PHP array (10,000 objects with
+     // metadata, referrers, etc.), use preg_match_all to extract only the 4
+     // fields we need directly from the raw JSON string. No nested structure of
+     // 10,000 associative arrays is ever built; the regexes yield four flat
+     // arrays of scalars, which are far cheaper to allocate and walk.
+     // This avoids the ~30MB peak memory allocation of a full decode.
      3 => (function () use ($payload): void {
-         $decoded = json_decode($payload, true, 512, JSON_THROW_ON_ERROR);
-         $events = $decoded['events'];
-         $total = count($events);
+         // Extract fields directly from raw JSON with regex.
+         // Each record has: "type":"pageview","url":"/page/42","duration_ms":123,"country":"US"
+         preg_match_all('/"type":"(\w+)"/', $payload, $typeMatches);
+         preg_match_all('/"country":"(\w+)"/', $payload, $countryMatches);
+         preg_match_all('/"duration_ms":(\d+)/', $payload, $durationMatches);
+         preg_match_all('/"url":"([^"]+)"/', $payload, $urlMatches);
+
+         $types = $typeMatches[1];
+         $countries = $countryMatches[1];
+         $durations = $durationMatches[1];
+         $urls = $urlMatches[1];
+         $total = count($types);
|
          $euCountries = ['DE' => true, 'FR' => true, 'IT' => true, 'GB' => true];

          $totalDuration = 0;
          $pageCounts = [];

-         foreach ($events as $event) {
-             $typeCounts[$event['type']]++;
+         for ($i = 0; $i < $total; $i++) {
+             $typeCounts[$types[$i]]++;

-             // isset() on hash map: O(1), faster than in_array()
-             $country = $event['country'];
+             $country = $countries[$i];
              if ($country === 'US') {
                  $usCount++;
              } elseif (isset($euCountries[$country])) {
                  $euCount++;
              }

-             $totalDuration += $event['duration_ms'];
-             $url = $event['url'];
+             $totalDuration += (int) $durations[$i];
+             $url = $urls[$i];
              $pageCounts[$url] = ($pageCounts[$url] ?? 0) + 1;
          }
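Two notes on iteration 3. First, the counter initializations ($typeCounts, $usCount, $euCount) presumably sit in the elided lines just above this loop (old lines 143-145). Second, regex-scraping JSON is only safe here because the seeded generator emits plain, unescaped values; in general it is brittle. A small demonstration of the failure mode, using only core PHP:

    // An escaped quote inside a JSON string defeats the naive pattern.
    $json = json_encode(['url' => '/search?q="php"']);
    // $json is: {"url":"\/search?q=\"php\""}
    preg_match('/"url":"([^"]+)"/', $json, $m);
    echo $m[1]; // \/search?q=\  ... truncated at the escaped quote

The same caveat applies if the metadata or referrer objects ever embed a "type" or "duration_ms" key: the regex counts would silently include them.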
|
|
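Finally, to check the claimed differences between the three iterations, a hypothetical harness (not part of this commit) could time each variant in a fresh process; echoing memory_get_peak_usage(true) inside the script itself would also let you verify the ~30MB figure quoted in the iteration 3 comment:

    <?php
    // Hypothetical benchmark driver; assumes 03-json-api.php sits alongside it.
    // Wall-clock time includes PHP startup and payload generation, so compare
    // the deltas between variants, not the absolute numbers.
    foreach ([1, 2, 3] as $variant) {
        $start = microtime(true);
        passthru(PHP_BINARY . ' 03-json-api.php ' . $variant);
        printf("iteration %d: %.1f ms\n", $variant, (microtime(true) - $start) * 1000);
    }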