This is a long answer, so here is the TL;DR:
- use combinations, not permutations, to drive the set of place pairs that need a distance calculation
- using numba and a haversine function is more than 10X faster than geopy.distance
set up
import requests
import pandas as pd
searchendpoint = "https://directory.spineservices.nhs.uk/ORD/2-0-0/organisations"

# Seconds to wait for either web service before giving up instead of hanging.
_REQUEST_TIMEOUT = 30


def _fetch_organisations(district):
    """Return the active organisations for postcode district ``HR<district>``.

    Raises ``requests.HTTPError`` on a non-2xx response so that a failed call
    is reported as such rather than as a missing "Organisations" key.
    """
    response = requests.get(
        searchendpoint,
        params={"PostCode": f"HR{district}", "Status": "Active"},
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return pd.json_normalize(response.json()["Organisations"])


def _lookup_postcodes(batch):
    """Return one row per postcode in ``batch`` from the bulk lookup endpoint.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    response = requests.post(
        "http://api.postcodes.io/postcodes",
        json={"postcodes": batch},
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return pd.json_normalize(response.json(), record_path="result")


# get all healthcare facilities in Herefordshire (postcode districts HR1..HR9)
dfhc = pd.concat(
    [_fetch_organisations(i) for i in range(1, 10)]
).reset_index(drop=True)

# get geo data for all postcodes associated with healthcare facilities
# API batch restriction of 100 post codes per call
# The unique postcode list is computed once and then sliced into batches.
_postcodes = dfhc.PostCode.unique().tolist()
dfgeo = (
    pd.concat(
        [
            _lookup_postcodes(_postcodes[i:i + 100])
            for i in range(0, len(_postcodes), 100)
        ]
    )
    .rename(columns=lambda c: c.replace("result.", ""))
    .reset_index(drop=True)
    # coord is a (longitude, latitude) tuple, in that order
    .assign(coord=lambda dfa: dfa.longitude.combine(dfa.latitude, lambda x, y: (x, y)))
)

# Rows with no postcode in the lookup result are kept aside, not used further.
_has_postcode = dfgeo.postcode.notna()
dfgeo_missed = dfgeo.loc[~_has_postcode]
dfgeo = dfgeo.loc[_has_postcode]
numba / geopy / haversine distance
import geopy.distance
from numba import jit
import numpy as np
# a few ways to calculate distance between two (lon,lat) pairs
@jit(nopython=True)
def haversine_jit(x, y):
    """Return the great-circle distance in km between two (lon, lat) pairs.

    Numba-compiled variant of the haversine formula.

    Parameters
    ----------
    x, y : tuple of float
        ``(longitude, latitude)`` in decimal degrees, the same order as the
        ``coord`` column.

    Returns
    -------
    float
        Distance in kilometres on a sphere of radius 6373 km.
    """
    # approximate radius of earth in km
    R = 6373.0
    # Index 0 is longitude and index 1 is latitude; reading them the other
    # way round silently produces wrong distances.
    s_lng = np.deg2rad(x[0])
    s_lat = np.deg2rad(x[1])
    e_lng = np.deg2rad(y[0])
    e_lat = np.deg2rad(y[1])
    d = np.sin((e_lat - s_lat) / 2) ** 2 + \
        np.cos(s_lat) * np.cos(e_lat) * \
        np.sin((e_lng - s_lng) / 2) ** 2
    return 2 * R * np.arcsin(np.sqrt(d))
def haversine(x, y):
    """Return the great-circle distance in km between two (lon, lat) pairs.

    Plain NumPy implementation of the haversine formula.

    Parameters
    ----------
    x, y : tuple of float
        ``(longitude, latitude)`` in decimal degrees, the same order as the
        ``coord`` column.

    Returns
    -------
    float
        Distance in kilometres on a sphere of radius 6373 km.
    """
    # approximate radius of earth in km
    R = 6373.0
    # Index 0 is longitude and index 1 is latitude; reading them the other
    # way round silently produces wrong distances.
    s_lng = np.deg2rad(x[0])
    s_lat = np.deg2rad(x[1])
    e_lng = np.deg2rad(y[0])
    e_lat = np.deg2rad(y[1])
    d = np.sin((e_lat - s_lat) / 2) ** 2 + \
        np.cos(s_lat) * np.cos(e_lat) * \
        np.sin((e_lng - s_lng) / 2) ** 2
    return 2 * R * np.arcsin(np.sqrt(d))
def geopykm(x, y):
    """Return the geopy distance in km between two (lon, lat) pairs.

    Parameters
    ----------
    x, y : tuple of float
        ``(longitude, latitude)`` in decimal degrees, the same order as the
        ``coord`` column.

    Returns
    -------
    float
        Distance in kilometres as computed by ``geopy.distance.distance``.
    """
    # geopy reads a coordinate tuple as (latitude, longitude), so the
    # (lon, lat) inputs are swapped before the call.
    return geopy.distance.distance((x[1], x[0]), (y[1], y[0])).km
optimisations
import itertools
# optimisation - pair each postcode with every other one exactly once
# (combinations, not permutations), then attach geo data to both ends
_pairs = pd.DataFrame(itertools.combinations(dfgeo.postcode.values, 2))
_geo_lookup = dfgeo.loc[:, ["postcode", "coord", "longitude", "latitude"]]
dfcombis = (
    _pairs
    # first member of the pair (lookup columns get the _x suffix after the second merge)
    .merge(_geo_lookup, left_on=0, right_on="postcode")
    # second member of the pair (lookup columns get the _y suffix)
    .merge(_geo_lookup, left_on=1, right_on="postcode")
    .drop(columns=[0, 1])
)
def testit(df, calc=geopykm, col="km"):
    """Return a copy of ``df`` with a distance column named ``col``.

    ``calc`` is called element-wise with each row's ``coord_x`` and
    ``coord_y`` values; its results fill the new column.
    """
    distances = df["coord_x"].combine(df["coord_y"], calc)
    return df.assign(**{col: distances})
# Benchmark the three distance functions over every postcode pair using the
# IPython %timeit magic. With no calc argument, testit uses its default (geopykm).
%timeit dfx = testit(dfcombis)
%timeit dfx = testit(dfcombis, calc=haversine)
%timeit dfx = testit(dfcombis, calc=haversine_jit)
# Keep a result frame for the later steps: the jit variant, written to column "km".
dfx = testit(dfcombis, calc=haversine_jit, col="km")
timings
1.77 s ± 63.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
280 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
125 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
usage
# Neighbour table: every ordered postcode pair that is at most 10 km apart.
# a to b is the same distance as b to a, so the reversed pairs are appended.
_near = dfx.km.le(10)
_forward = dfx.loc[_near, ["postcode_x", "postcode_y", "km"]]
_backward = dfx.loc[_near, ["postcode_y", "postcode_x", "km"]].rename(
    columns={"postcode_x": "postcode_y", "postcode_y": "postcode_x"}
)
# some locations host multiple facilities, so include a to a at 0 km
_all_postcodes = dfhc.PostCode.unique()
_self_pairs = pd.DataFrame(
    {"postcode_x": _all_postcodes, "postcode_y": _all_postcodes, "km": 0}
)
dfnb = pd.concat([_forward, _backward, _self_pairs]).reset_index(drop=True)
# finally some analysis: for each facility in the first role group (GP
# surgeries, per the original note) keep its closest facility from the
# second role group (pharmacies, per the original note)
_facility_cols = ["Name", "PostCode", "PrimaryRoleDescription"]
_origins = dfhc.loc[dfhc.PrimaryRoleId.isin(["RO180", "RO96"]), _facility_cols]
_targets = dfhc.loc[dfhc.PrimaryRoleId.isin(["RO182", "RO177"]), _facility_cols]
(
    dfnb.merge(_origins, left_on="postcode_x", right_on="PostCode")
    .merge(_targets, left_on="postcode_y", right_on="PostCode")
    # sort by distance within each origin so the nearest match comes first
    .sort_values(["Name_x", "km"])
    .groupby(["Name_x"], as_index=False)
    .first()
)
|
Name_x |
postcode_x |
postcode_y |
km |
PostCode_x |
PrimaryRoleDescription_x |
Name_y |
PostCode_y |
PrimaryRoleDescription_y |
| 0 |
22A KING STREET |
HR4 9DA |
HR4 9AA |
0.213861 |
HR4 9DA |
PRIMARY CARE TRUST SITE |
BOOTS UK LIMITED |
HR4 9AA |
PHARMACY |
| 1 |
ALTON STREET SURGERY |
HR9 5AB |
HR9 5AB |
0 |
HR9 5AB |
PRIMARY CARE TRUST SITE |
ALTON STREET SURGERY |
HR9 5AB |
PRESCRIBING COST CENTRE |
| 2 |
AUBREY STREET |
HR4 0BU |
HR4 9AA |
0.148447 |
HR4 0BU |
PRIMARY CARE TRUST SITE |
BOOTS UK LIMITED |
HR4 9AA |
PHARMACY |
| 3 |
AYLESTONE HILL SURGERY |
HR1 1HR |
HR4 9AA |
1.46984 |
HR1 1HR |
BRANCH SURGERY |
BOOTS UK LIMITED |
HR4 9AA |
PHARMACY |
| 4 |
BARRS COURT SCHOOL |
HR1 1EQ |
HR4 9AA |
1.27244 |
HR1 1EQ |
PRIMARY CARE TRUST SITE |
BOOTS UK LIMITED |
HR4 9AA |
PHARMACY |
| 5 |
BELMONT ABBEY |
HR2 9RP |
HR2 9RP |
0 |
HR2 9RP |
PRIMARY CARE TRUST SITE |
CYPS - LINDEN CENTRE |
HR2 9RP |
PRESCRIBING COST CENTRE |
| 6 |
BELMONT HEALTH CENTRE |
HR2 7XT |
HR2 7XT |
0 |
HR2 7XT |
PRIMARY CARE TRUST SITE |
BELMONT MEDICAL CENTRE |
HR2 7XT |
PRESCRIBING COST CENTRE |
| 7 |
BLACKMARSTON SCHOOL |
HR2 7NX |
HR2 7JE |
0.975908 |
HR2 7NX |
PRIMARY CARE TRUST SITE |
ASDA PHARMACY |
HR2 7JE |
PHARMACY |
| 8 |
BOBBLESTOCK SURGERY |
HR4 9LP |
HR4 9AA |
3.5643 |
HR4 9LP |
BRANCH SURGERY |
BOOTS UK LIMITED |
HR4 9AA |
PHARMACY |
| 9 |
BODENHAM SURGERY |
HR1 3JU |
HR6 8LR |
9.71357 |
HR1 3JU |
PRIMARY CARE TRUST SITE |
BOOTS UK LIMITED |
HR6 8LR |
PHARMACY |
| 10 |
DENTAL ACCESS CENTRE/HEALTH PROMOTION |
HR2 7JE |
HR2 7JE |
0 |
HR2 7JE |
PRIMARY CARE TRUST SITE |
ASDA PHARMACY |
HR2 7JE |
PHARMACY |
| 11 |
ETNAM STREET MENTAL HEALTH RESOURCE CENTRE |
HR6 8AN |
HR6 8LR |
0.557963 |
HR6 8AN |
PRIMARY CARE TRUST SITE |
BOOTS UK LIMITED |
HR6 8LR |
PHARMACY |
| 12 |
KINGTON COURT HEALTH AND SOCIAL CARE CENTRE |
HR5 3BX |
HR5 3BJ |
0.649622 |
HR5 3BX |
PRIMARY CARE TRUST SITE |
KINGTON PHARMACY |
HR5 3BJ |
PHARMACY |
| 13 |
KINGTON SURGERY |
HR5 3EA |
HR5 3EA |
0 |
HR5 3EA |
PRIMARY CARE TRUST SITE |
KINGTON MEDICAL PRACTICE |
HR5 3EA |
PRESCRIBING COST CENTRE |
distance.distance? Can it natively process numpy arrays?