Happy 2021, everyone!
I started coding some months ago for fun and recently I challenged myself to build a JSON parser in Python (v3.8).
The basic idea was to avoid loading the whole file at once and instead parse it line by line. But of course I'm still a newbie, so there are probably a lot of absurdities in there. Constructive criticism is very welcome!
The full code is here: https://pastebin.com/fEP4n9Gw The sample JSON used to test it is here: https://pastebin.com/587jqziH
EDIT: I rewrote the main parsing function so that it can handle compact JSON files. It's of course still far from optimal code, but that's not the point. :)
import re
import ast
class TranslateJSON(ast.NodeTransformer):
    '''
    NodeTransformer that replaces the JSON names null/true/false with the
    Python constants None, True and False before the tree is evaluated.
    '''
    translate_map = {'null': None, 'true': True, 'false': False}

    def visit_Name(self, node):
        '''
        Return a Constant for a JSON keyword name, or the node itself otherwise.

        Handing back the untouched node matters: a NodeTransformer visitor that
        returns None removes the visited node from the tree.
        '''
        if node.id not in self.translate_map:
            return node
        replacement = ast.Constant(value=self.translate_map[node.id], kind=None)
        # Carry over lineno/col_offset (and their end_* twins) from the Name node.
        return ast.copy_location(replacement, node)
class JSON_parser():
    '''
    Small JSON reader that can fetch one value without loading the whole file.

    Attributes:
    'file_path': path of the JSON file to parse.
    'map': None until buildDict() is called, then the whole document evaluated
           into Python objects.

    The methods that should be called directly are:
    read(): accepts one argument, an iterable with the whole hierarchy of keys
            to query (from the outermost to the innermost). The file is scanned
            line by line and scanning stops as soon as the value is captured.
    buildDict(): evaluates the whole JSON file (or a JSON string passed as an
            argument) and stores the result in self.map.

    Captured text is evaluated as a Python literal (ast.literal_eval), which is
    close to, but not the same thing as, a strict JSON decoder.
    '''
    # Pattern for a non-object, non-array value. The string alternative is
    # escape-aware so it stops at the string's own closing quote, even when the
    # rest of a compact line contains more quotes.
    full_value_regex = re.compile(
        r'^\s*("(?:[^"\\]|\\.)*"|null|true|false|-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)')
    # Pattern to find out whether a value is a JSON object or array.
    first_char_regex = re.compile(r'^\s*([{\[]).*')
    # Everything ast.parse / ast.literal_eval may raise on bad input.
    _eval_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    def __init__(self, file_path):
        self.file_path = file_path
        self.map = None

    def cleanString(self, line):
        '''
        Prepares a line to be parsed (surrounding whitespace is stripped).
        '''
        return line.strip()

    def translate_and_eval(self, value):
        '''
        Evaluates a captured JSON text into Python data types.

        The text is first evaluated as is; if that fails, null/true/false are
        translated to None/True/False and the evaluation is retried.
        Raises ValueError when the text cannot be parsed or evaluated.
        '''
        try:
            ast_obj = ast.parse(value, mode='eval')
        except self._eval_errors as err:
            raise ValueError(f"JSON malformed. Error evaluating {value}") from err
        try:
            return ast.literal_eval(ast_obj)
        except self._eval_errors:
            pass  # most likely a JSON keyword; retry after translating the names
        try:
            TranslateJSON().visit(ast_obj)
            return ast.literal_eval(ast_obj)
        except self._eval_errors as err:
            raise ValueError(f"JSON malformed. Error evaluating {value}") from err

    def buildDict(self, string_to_eval=''):
        '''
        Reads the whole file and evaluates it, storing the result in self.map.
        Alternatively, a JSON document can be passed as a string argument.
        '''
        if not string_to_eval:
            with open(self.file_path, encoding='utf-8') as source:
                string_to_eval = ''.join(self.cleanString(line) for line in source)
        self.map = self.translate_and_eval(string_to_eval)

    def read(self, keys):
        '''
        Returns the value found by following 'keys' through nested objects,
        scanning the file line by line instead of loading it at once.

        'keys' has to hold all the keys being searched, from outer to innermost.
        Ex.: self.read(['outerkey','middlekey','finalkey'])
        The captured text is evaluated literally before being returned.
        Raises ValueError for an empty 'keys' or a value that cannot be
        evaluated, and KeyError when the path does not exist.
        '''
        with open(self.file_path, encoding='utf-8') as file:
            value = self._search(keys, file)
        return self.translate_and_eval(value)

    def _missing_key(self, key, last_endpos):
        '''Builds the KeyError raised when a key of the hierarchy is absent.'''
        return KeyError(f"{key} not found in file. Last valid key found at line {last_endpos[0]+1} and endchar index {last_endpos[1]}")

    def _search(self, keys, file):
        '''
        Walks the file once, matching each key of the hierarchy in turn, and
        returns the raw text of the last key's value (via _getValue()).

        Arguments:
        keys: keys to search, from outer to innermost. Each key is compared
              with the text between the quotes exactly as written in the file.
        file: the open file object to scan.

        A key only counts when it is a direct member of the object selected by
        the previous key; once that object closes, the search fails with
        KeyError instead of wandering into sibling objects.
        '''
        keys = list(keys)
        if not keys:
            raise ValueError("No keys given: at least one key is required.")

        key_index = 0            # position in 'keys' of the key being looked for
        depth = 0                # open '{' and '[' seen so far, minus the closed ones
        target_depth = 0         # depth at which members of the searched object live
        awaiting_object = True   # next token has to be the '{' of the object to search
        in_string = False
        escaped = False          # previous character inside the string was a backslash
        string_chars = []
        pending_key = None       # last string closed at target depth; a key if ':' follows
        last_endpos = [0, 0]     # [line, index of the ':'] of the last matched key

        file.seek(0)
        for line_number, line in enumerate(file):
            clean_line = self.cleanString(line)
            for char_index, char in enumerate(clean_line):
                if in_string:
                    if escaped:
                        escaped = False
                        string_chars.append(char)
                    elif char == '\\':
                        escaped = True
                        string_chars.append(char)
                    elif char == '"':
                        in_string = False
                        if depth == target_depth:
                            pending_key = ''.join(string_chars)
                    else:
                        string_chars.append(char)
                    continue
                if char.isspace():
                    continue
                if pending_key is not None:
                    candidate, pending_key = pending_key, None
                    if char == ':' and candidate == keys[key_index]:
                        # Positions refer to the stripped line, as _getValue() expects.
                        last_endpos = [line_number, char_index]
                        if key_index == len(keys) - 1:
                            return self._getValue(last_endpos, file)
                        key_index += 1
                        awaiting_object = True
                        continue
                if awaiting_object:
                    if char != '{':
                        # The value that should contain the next key is not an object.
                        raise self._missing_key(keys[key_index], last_endpos)
                    depth += 1
                    target_depth = depth
                    awaiting_object = False
                elif char == '"':
                    in_string = True
                    string_chars = []
                elif char in '{[':
                    depth += 1
                elif char in '}]':
                    depth -= 1
                    if depth < target_depth:
                        # The object being searched has closed without a match.
                        raise self._missing_key(keys[key_index], last_endpos)
        raise self._missing_key(keys[key_index], last_endpos)

    def _getValue(self, match_end, file):
        '''
        Returns the raw text of the value that follows a matched key.

        A string, number, null, true or false is captured directly with a
        regex. Otherwise the value has to start with { or [ and characters are
        collected (lines stripped and joined) until that bracket is balanced;
        brackets inside strings are not counted.

        Arguments:
        match_end: [line number, index of the ':' in the stripped line] of the
                   key. Parsing starts right after that position.
        file: the open file object to scan.

        Raises ValueError when no value can be captured.
        '''
        file.seek(0)
        capturing = False    # True once the opening bracket of an object/array was seen
        depth = 0
        in_string = False
        escaped = False
        pieces = []
        line_number = match_end[0]
        for line_number, line in enumerate(file):
            if line_number < match_end[0]:
                continue
            clean_line = self.cleanString(line)
            if line_number == match_end[0]:
                clean_line = clean_line[match_end[1]+1:]  # starts parsing after the ':'
            if not capturing:
                clean_line = clean_line.lstrip()
                if not clean_line:
                    continue  # the value starts on a later line
                full_value_match = self.full_value_regex.match(clean_line)
                if full_value_match:
                    return full_value_match.group(1)
                if self.first_char_regex.match(clean_line) is None:
                    raise ValueError(f"Could not retrieve value. JSON is probably malformed. Line: {line_number}")
                capturing = True
            for char in clean_line:
                pieces.append(char)
                if in_string:
                    if escaped:
                        escaped = False
                    elif char == '\\':
                        escaped = True
                    elif char == '"':
                        in_string = False
                elif char == '"':
                    in_string = True
                elif char in '{[':
                    depth += 1
                elif char in '}]':
                    depth -= 1
                    if depth == 0:
                        return ''.join(pieces)
        # End of file reached before the value was complete.
        raise ValueError(f"Could not retrieve value. JSON is probably malformed. Line: {line_number}")
# Manual benchmark and demo snippets for running this module as a script.
# Every snippet below is wrapped in a triple-quoted string, so none of it is
# executed; remove the surrounding quotes of a snippet to run it.
# The snippets read 'pop_map.json' and 'pop_map_compact.json'.
# Snippet 1 times read() on both files against buildDict() and the standard
# json module (500 runs each); snippets 2-4 print sample lookups.
if __name__ == '__main__':
'''
import timeit
a="""
pop_map = JSON_parser('pop_map.json')
x = pop_map.read(['investor', 'jewellery', 'consumption'])
y = pop_map.read(['worker', 'fish'])
z = pop_map.read(['scholar'])
"""
b="""
pop_map = JSON_parser('pop_map.json')
pop_map.buildDict()
x = pop_map.map['investor']['jewellery']['consumption']
y = pop_map.map['worker']['fish']
z = pop_map.map['scholar']
"""
c="""
pop_map = JSON_parser('pop_map_compact.json')
x = pop_map.read(['investor', 'jewellery', 'consumption'])
y = pop_map.read(['worker', 'fish'])
z = pop_map.read(['scholar'])
"""
d="""
import json
with open('pop_map.json') as js:
data = json.load(js)
x = data['investor']['jewellery']['consumption']
y = data['worker']['fish']
z = data['scholar']
"""
print(timeit.timeit(stmt=a, setup="from __main__ import JSON_parser", number=500))
print(timeit.timeit(stmt=b, setup="from __main__ import JSON_parser", number=500))
print(timeit.timeit(stmt=c, setup="from __main__ import JSON_parser", number=500))
print(timeit.timeit(stmt=d, setup="from __main__ import JSON_parser", number=500))
'''
'''
pop_map = JSON_parser('pop_map.json')
x = pop_map.read(['investor', 'jewellery', 'consumption'])
y = pop_map.read(['worker', 'market'])
z = pop_map.read(['scholar'])
print(x,y,z, sep='\n\n', end='\n\n\n')
'''
'''
pop_map = JSON_parser('pop_map.json')
pop_map.buildDict()
x = pop_map.map['investor']['jewellery']['consumption']
y = pop_map.map['worker']['fish']
z = pop_map.map['scholar']
print(x,y,z, sep='\n\n')
'''
'''
pop_map = JSON_parser('pop_map_compact.json')
x = pop_map.read(['investor', 'jewellery', 'consumption'])
y = pop_map.read(['worker', 'market'])
z = pop_map.read(['scholar'])
print(x,y,z, sep='\n\n', end='\n\n\n')
'''
Kind regards,
Bernardo
json library?