Processing One Billion Rows
… with
PHP
Florian
Engelhardt
1brc?
The output
{
Abha=-23.0/18.0/59.2,
Abidjan=-16.2/26.0/67.3,
Abéché=-10.0/29.4/69.0,
Accra=-10.1/26.4/66.4,
Addis Ababa=-23.7/16.0/67.0,
Adelaide=-27.8/17.3/58.5
...
}
The rules
- no external library dependencies
- single source file
- computation at runtime
Let’s give it a try, shall we?
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
| $stations = []; |
| $fp = fopen('measurements.txt', 'r'); |
| |
| while ($data = fgetcsv($fp, null, ';')) { |
| if (!isset($stations[$data[0]])) { |
| $stations[$data[0]] = [ |
| $data[1], |
| $data[1], |
| $data[1], |
| 1 |
| ]; |
| } else { |
| $stations[$data[0]][3] ++; |
| $stations[$data[0]][2] += $data[1]; |
| if ($data[1] < $stations[$data[0]][0]) { |
| $stations[$data[0]][0] = $data[1]; |
| } |
| if ($data[1] > $stations[$data[0]][1]) { |
| $stations[$data[0]][1] = $data[1]; |
| } |
| } |
| } |
| |
| ksort($stations); |
| |
| echo '{'; |
| foreach($stations as $k=>&$station) { |
| $station[2] = $station[2]/$station[3]; |
| echo $k, '=', $station[0], '/', $station[2], '/', $station[1], ', '; |
| } |
| echo '}'; |
What is happening in fgetcsv()
?
Switch to fgets()
and substr()
!
| while ($data = fgets($fp, 108)) { |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = substr($data, $pos+1, -1); |
| |
Whats going on?
| while ($data = fgets($fp, 108)) { |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = substr($data, $pos+1, -1); |
| |
| if (!isset($stations[$city])) { |
| $stations[$city] = [ |
| $temp, |
| $temp, |
| $temp, |
| 1 |
| ]; |
| } else { |
| $stations[$city][3] ++; |
| $stations[$city][2] += $temp; |
| if ($temp < $stations[$city][0]) { |
| $stations[$city][0] = $temp; |
| } |
| if ($temp > $stations[$city][1]) { |
| $stations[$city][1] = $temp; |
| } |
| } |
| } |
References FTW
| while ($data = fgets($fp, 108)) { |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = substr($data, $pos+1, -1); |
| |
| if (!isset($stations[$city])) { |
| $stations[$city] = [ |
| $temp, |
| $temp, |
| $temp, |
| 1 |
| ]; |
| } else { |
| $station = &$stations[$city]; |
| $station[3] ++; |
| $station[2] += $temp; |
| if ($temp < $station[0]) { |
| $station[0] = $temp; |
| } |
| if ($temp > $station[1]) { |
| $station[1] = $temp; |
| } |
| } |
| } |
Less compares
| if ($temp < $station[0]) { |
| $station[0] = $temp; |
| } |
| if ($temp > $station[1]) { |
| $station[1] = $temp; |
| } |
| if ($temp < $station[0]) { |
| $station[0] = $temp; |
| } elseif ($temp > $station[1]) { |
| $station[1] = $temp; |
| } |
Runtime:
17m 30s
Bad bad type juggling!
| $a = "100"; |
| $b = "25.5"; |
| if ($a > $b) { |
| echo "100 > 25.5"; |
| } else { |
| echo "25.5 > 100"; |
| } |
If both operands are numeric strings, or one operand is a number and
the other one is a numeric string, then the comparison is done
numerically.
No juggling!
| while ($data = fgets($fp, 108)) { |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = (float)substr($data, $pos+1, -1); |
| |
| if (!isset($stations[$city])) { |
| $stations[$city] = [ |
| $temp, |
| $temp, |
| $temp, |
| 1 |
| ]; |
| } else { |
| $station = &$stations[$city]; |
| $station[3] ++; |
| $station[2] += $temp; |
| if ($temp < $station[0]) { |
| $station[0] = $temp; |
| } elseif ($temp > $station[1]) { |
| $station[1] = $temp; |
| } |
| } |
| } |
| while ($data = fgets($fp, 108)) { |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = (float)substr($data, $pos+1, -1); |
| |
| if (!isset($stations[$city])) { |
| $stations[$city] = [ |
| $temp, |
| $temp, |
| $temp, |
| 1 |
| ]; |
| } else { |
| $station = &$stations[$city]; |
| $station[3] ++; |
| $station[2] += $temp; |
| if ($temp < $station[0]) { |
| $station[0] = $temp; |
| } elseif ($temp > $station[1]) { |
| $station[1] = $temp; |
| } |
| } |
| } |
| while ($data = fgets($fp, 108)) { |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = (float)substr($data, $pos+1, -1); |
| |
| if (!isset($stations[$city])) { |
| $stations[$city] = [ |
| $temp, |
| $temp, |
| $temp, |
| 1 |
| ]; |
| } else { |
| $station = &$stations[$city]; |
| $station[3] ++; |
| $station[2] += $temp; |
| if ($temp < $station[0]) { |
| $station[0] = $temp; |
| } elseif ($temp > $station[1]) { |
| $station[1] = $temp; |
| } |
| } |
| } |
Whats going on?

Something important about PHP on CLI
No OPcache …
… and with no OPcache comes …
opcache.enable_cli = 1
opcache.jit-buffer-size = 10M
Is there more?
Divide et impera
Demo time!
Bonus!
| if (!isset($stations[$city])) { |
| |
| } else { |
| $station = &$stations[$city]; |
| |
| } |
| $station = &$stations[$city]; |
| if ($station === NULL) { |
| |
| } else { |
| |
| } |
Bonus!!
| while ($data = fgets($fp)) { |
| $chunk_start += strlen($data); |
| if ($chunk_start > $chunk_end) { |
| break; |
| } |
| |
| while ($chunk_start < $chunk_end) { |
| $data = fgets($fp); |
| $chunk_start += strlen($data); |
| |
Bonus!!!
| $data = fgets($fp); |
| $chunk_start += strlen($data); |
| $pos = strpos($data, ';'); |
| $city = substr($data, 0, $pos); |
| $temp = (float)substr($data, $pos+1, -1); |
| $city = stream_get_line($fp, 99, ';'); |
| $temp = stream_get_line($fp, 99, "\n"); |
| $chunk_start += strlen($city) + strlen($temp) + 2; |
| $temp = (float)$temp; |
25 minutes can be okay
PHP is fast!