This comprehension returns the expected result by
- Iterating the index
- Applying boolean indexing to each series
- Returning a dictionary for each series
import pandas as pd
# Sample table: column 'I' carries the row labels, the rest are weights.
d = {
    'I': ['A', 'B', 'C', 'D'],
    'X': [1, 0, 3, 1],
    'Y': [0, 1, 2, 1],
    'Z': [1, 0, 0, 0],
    'W': [3, 2, 0, 0],
}
df = pd.DataFrame(data=d, columns=['I', 'X', 'Y', 'Z', 'W']).set_index('I')

# For every row label keep only the non-zero entries, as a plain dict.
out = {}
for label in df.index:
    row = df.loc[label]
    out[label] = row[row != 0].to_dict()
print(out)
Result
{'A': {'X': 1, 'Z': 1, 'W': 3}, 'B': {'Y': 1, 'W': 2}, 'C': {'X': 3, 'Y': 2}, 'D': {'X': 1, 'Y': 1}}
Wrapping values in a set
# Same row filter as above, but every surviving value is wrapped in a one-element set.
{
    label: df.loc[label][df.loc[label].ne(0)].apply(lambda value: {value}).to_dict()
    for label in df.index
}
{'A': {'X': {1}, 'Z': {1}, 'W': {3}}, 'B': {'Y': {1}, 'W': {2}}, 'C': {'X': {3}, 'Y': {2}}, 'D': {'X': {1}, 'Y': {1}}}
Testing
Timing both approaches on a 2000 x 2000 DataFrame (3 runs each) shows the accepted answer is about 40% slower.
import pandas as pd
import timeit
import json
import random
def create_json(jpath, n, idx_name, seed=None):
    """Write a random ``n`` x ``n`` integer table to ``jpath`` as JSON.

    The file holds a single JSON object with ``n`` data columns named
    ``i0`` .. ``i{n-1}``, each a list of ``n`` integers drawn from
    ``[0, n // 5]``, plus one extra column ``idx_name`` that lists the
    data column names (so it also has length ``n``).

    Args:
        jpath: Destination path of the JSON file; opened in write mode.
        n: Number of data columns and number of values per column.
        idx_name: Key under which the label column is stored.
        seed: Optional seed for reproducible data. When ``None`` the
            module-level ``random`` generator is used, exactly as before.

    Returns:
        ``idx_name``, unchanged.
    """
    # A private generator keeps seeded runs from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    upper = n // 5
    data = {f'i{i}': [rng.randint(0, upper) for _ in range(n)] for i in range(n)}
    # Snapshot the column names before the label column itself is added.
    data[idx_name] = list(data)
    with open(jpath, 'w', encoding='utf-8') as j:
        json.dump(data, j)
    return idx_name
def lmc_method(df):
    """Map each index label of ``df`` to a dict of its non-zero entries.

    Args:
        df: DataFrame whose rows are looked up by label via ``df.loc``.

    Returns:
        ``{label: {column: value, ...}, ...}`` containing, for every label
        in ``df.index``, only the columns whose value is not equal to 0.
    """
    out = {}
    for label in df.index:
        # Look the row up once and reuse it for both the mask and the selection.
        row = df.loc[label]
        out[label] = row[row != 0].to_dict()
    return out
def vs_method(df):
    """Map each index label of ``df`` to its non-zero entries, each wrapped in a set.

    The frame is converted once with ``to_dict(orient='index')`` and then
    filtered in plain Python.

    Args:
        df: DataFrame to convert.

    Returns:
        ``{label: {column: {value}, ...}, ...}`` keeping only the columns
        whose value is not equal to 0; every kept value is wrapped in a
        one-element set.
    """
    data_dict = df.to_dict(orient='index')
    edge_dictionary = {
        node: {
            attribute: {weight}
            for attribute, weight in connections.items()
            if weight != 0
        }
        for node, connections in data_dict.items()
    }
    return edge_dictionary
# Benchmark configuration: table size and location of the JSON fixture.
n = 2000
jpath = f'/home/lmc/tmp/faker_data_{n}.json'
idx_name = '38cd7657-f731-4ce3-9160-a3fbfc6619dc'

# Edit and uncomment to create test data
#create_json(jpath, n, idx_name)

# Load the fixture and use the label column as the index.
with open(jpath, 'r') as j:
    data = json.load(j)
df = pd.DataFrame(data=data, columns=list(data))
df = df.set_index(idx_name)

# Time three runs of each method; the second line also prints the relative slowdown.
t1 = timeit.timeit(lambda: lmc_method(df), number=3)
print(f"lmc_method: {t1:.2f}")
t2 = timeit.timeit(lambda: vs_method(df), number=3)
print(f"vs_method : {t2:.2f}, {t2/t1 - 1:.2f}")
lmc_method: 5.22
vs_method : 7.31, 0.40
Note: the set-wrapping variant does not need `.apply()`; a plain nested comprehension also works: `{k: {c: {v} for c, v in inner.items() if v} for k, inner in df.to_dict(orient="index").items()}`.