Skip to content

Commit ecd94e3

Browse files
authored
Merge pull request #27 from torreyma/master
added pandas multiprocessing example in examples/ directory
2 parents 9eac4f4 + cdf0fbe commit ecd94e3

1 file changed

Lines changed: 72 additions & 0 deletions

File tree

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# from: https://gist.github.com/ishiland/824ddd386fcd0b90fc55aea573a28b22
2+
# written by ishiland: https://github.com/ishiland
3+
# Minor edits by torreyma: https://github.com/torreyma
4+
#
5+
from geosupport import Geosupport, GeosupportError
6+
import pandas as pd
7+
from multiprocessing import Pool, cpu_count
8+
from functools import partial
9+
import numpy as np
10+
11+
"""
12+
Example of how to use python-geosupport, Pandas and Multiprocessing to speed up geocoding workflows.
13+
"""
14+
15+
# Default number of worker processes: the CPU count reported by cpu_count().
cpus = cpu_count()

# Geosupport handle used by geo_by_address() below.
# For Windows: geosupport_path points at the Geosupport Desktop Edition install directory.
# On Linux, the Geosupport location is set through the GEOFILES and
# LD_LIBRARY_PATH environment variables.
g = Geosupport(geosupport_path="C:\\Program Files (x86)\\Geosupport Desktop Edition")
20+
21+
22+
def geo_by_address(row):
    """
    Geocode a single address record with Geosupport.

    The house number, street name and ZIP code are read from the 'PHN',
    'STREET' and 'ZIP_CODE' entries of ``row``; adjust these keys to match
    the column names of your own data.

    :param row: Pandas Series holding one address record.
    :return: Pandas Series of three values: latitude, longitude and the
        Geosupport message. If Geosupport raises a GeosupportError, latitude
        and longitude are both the string "Error" and the message is the
        text of the exception.
    """
    try:
        match = g.address(
            house_number=row['PHN'],
            street_name=row['STREET'],
            zip=row['ZIP_CODE'],
        )
        fields = [match.get(key) for key in ("Latitude", "Longitude", "Message")]
    except GeosupportError as err:
        # Keep the row in the output and record why it could not be geocoded.
        fields = ["Error", "Error", str(err)]
    return pd.Series(fields)
40+
41+
42+
def parallelize(data, func, num_of_processes=cpus):
    """
    Split ``data`` into row chunks, run ``func`` on each chunk in a worker
    process, and concatenate the results.

    :param data: Pandas DataFrame (or Series) to process.
    :param func: Callable that takes one chunk of ``data`` and returns a
        pandas object. It is sent to worker processes, so it must be
        picklable (e.g. a module-level function or a ``functools.partial``
        of one).
    :param num_of_processes: Maximum number of chunks / worker processes.
        Defaults to the module-level ``cpus``.
    :return: ``pd.concat`` of the per-chunk results, in chunk order.
    :raises ValueError: if ``num_of_processes`` is less than 1.
    """
    if num_of_processes < 1:
        raise ValueError("num_of_processes must be at least 1")

    # Never create more chunks than rows, so func is not handed zero-row
    # chunks when data has fewer rows than there are processes.
    total_rows = len(data)
    chunk_count = max(1, min(num_of_processes, total_rows))

    # Slice positionally with .iloc rather than np.array_split: array_split
    # on a DataFrame goes through DataFrame.swapaxes, which is deprecated in
    # recent pandas. The chunk sizes follow the same rule as array_split
    # (the first `remainder` chunks get one extra row).
    base_size, remainder = divmod(total_rows, chunk_count)
    data_split = []
    start = 0
    for chunk_index in range(chunk_count):
        stop = start + base_size + (1 if chunk_index < remainder else 0)
        data_split.append(data.iloc[start:stop])
        start = stop

    # The with-block guarantees the workers are torn down even if a chunk
    # raises; on success the pool is still closed and joined gracefully.
    with Pool(chunk_count) as pool:
        results = pool.map(func, data_split)
        pool.close()
        pool.join()
    return pd.concat(results)
49+
50+
51+
def run_on_subset(func, data_subset):
    """
    Apply ``func`` to every row of one chunk of data.

    :param func: Callable invoked once per row, receiving that row as a
        Pandas Series.
    :param data_subset: Pandas DataFrame chunk to process.
    :return: The object ``DataFrame.apply`` builds from the per-row results.
    """
    # axis=1 hands func one row at a time instead of one column at a time.
    per_row_result = data_subset.apply(func, axis=1)
    return per_row_result
53+
54+
55+
def parallelize_on_rows(data, func, num_of_processes=cpus):
    """
    Run a row-wise function over ``data`` using multiple processes.

    :param data: Pandas DataFrame whose rows are to be processed.
    :param func: Callable applied to each row (receives a Pandas Series).
    :param num_of_processes: Number of processes passed on to
        ``parallelize``. Defaults to the module-level ``cpus``.
    :return: The combined result returned by ``parallelize``.
    """
    # Bind func now so each worker only needs to be handed its chunk.
    chunk_worker = partial(run_on_subset, func)
    return parallelize(data, chunk_worker, num_of_processes)
57+
58+
59+
if __name__ == '__main__':
    # Load the address records to geocode.
    addresses = pd.read_csv('INPUT.csv')

    # Geocode every row in parallel, then attach the three result columns:
    # latitude, longitude and the Geosupport message.
    geocoded = parallelize_on_rows(addresses, geo_by_address)
    addresses[['lat', 'lon', 'msg']] = geocoded

    # Write the input data plus the 3 new columns back out as CSV.
    addresses.to_csv('OUTPUT.csv')
69+
70+
71+
72+

0 commit comments

Comments
 (0)