I have read multiple StackOverflow articles on this and most of the top 10 Google results. Where my issue deviates is that I am using one Python script to create my JSON files, and the next script, run not ten minutes later, can't read one of those very files.
Short version: I generate leads for my online business, and I am attempting to learn Python in order to have better analytics on those leads. I am scouring two years' worth of leads with the intent of retaining the useful data and dropping anything personal - email addresses, names, etc. - while also saving the 30,000+ leads into a few dozen files for easy access.
So my first script opens every single individual lead file - 30,000+ of them - and determines the date each lead was captured based on a timestamp in the file. It then saves that lead under the appropriate key in a dict. When all the data has been aggregated into this dict, text files are written out using json.dumps.
The dict's structure is:
addData['lead']['July_2013'] = { ... }
where the 'lead' key can be lead, partial, and a few others, and the 'July_2013' key is obviously a date-based key that can be any full month name combined with 2013 or 2014, going back to 'February_2013'.
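For illustration, the aggregated structure ends up looking something like this (the timestamps and the fields inside each lead are made up here, since I can't post real data):

addData = {
    'lead': {
        'July_2013': {
            1374238335.0: {'someField': 'someValue'},    # made-up field names
            1374241122.0: {'someField': 'anotherValue'}
        },
        'August_2013': {}    # ...and so on for each month
    },
    'partial': {}            # same layout for the other extensions
}

Each innermost dict is keyed by the lead's timestamp and holds the remaining (non-personal) fields of that lead.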
The full error is this:
ValueError: Unterminated string starting at: line 1 column 9997847 (char 9997846)
But I've manually looked at the file and my IDE says there are only 76,655 chars in the file. So how did it get to 9997846?
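One thing I plan to check is whether the IDE is even showing me the whole file, i.e. compare the size on disk against what a plain read returns (quick sketch; the path is just a placeholder for the failing file):

import os

failing = 'D://path//to//the//failing//file.cd.lead.agg'  # placeholder path

print "Size on disk: {} bytes".format(os.path.getsize(failing))

with open(failing, 'rb') as fh:
    raw = fh.read()
print "Characters read by Python: {}".format(len(raw))

If those numbers come back around 10 million rather than 76,655, then the IDE is just truncating the display and the error offset at least makes sense.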
The file that fails is the 8th to be read; the other 7 and all other files that come after it read in via json.loads just fine.
Python says there is an unterminated string, so I looked at the end of the JSON in the file that fails and it appears to be fine. I've seen some mention of newlines needing to be \n in JSON, but this string is all one line. I've seen mention of \ vs \\, but in a quick look over the whole file I didn't see any backslashes at all. Other files do have \ and they read in fine. And, these files were all created by json.dumps.
I can't post the file because it still has personal info in it. Manually attempting to validate the JSON of a 76,000-char file isn't really viable.
Thoughts on how to debug this would be appreciated. In the meantime I am going to try to rebuild the files and see if this wasn't just a one-off bug, but that takes a while.
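While the rebuild runs, I'm also thinking of something like the following to find out which aggregate files fail and what the text around the reported offset looks like, rather than eyeballing them (just a sketch; it reuses f.glob and f.file_get_contents from my own helper library and assumes the ValueError message always ends with the '(char N)' part):

import json
import re
from p2p.basic import files as f

leadDir = 'D://Server Data//eagle805//emmetrics//forms//leads//'

for aggFile in f.glob(leadDir + '*.agg'):
    raw = f.file_get_contents(aggFile)
    try:
        json.loads(raw)
    except ValueError as e:
        print "{} failed: {}".format(aggFile, e)
        # Pull the char offset out of the message and show the surrounding text
        m = re.search(r'\(char (\d+)\)', str(e))
        if m:
            pos = int(m.group(1))
            print repr(raw[max(0, pos - 60):pos + 60])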
- Python 2.7 via Spyder & Anaconda
- Windows 7 Pro
--- Edit --- Per request I am posting the Write Code here:
from p2p.basic import files as f
from p2p.adv import strTools as st
from p2p.basic import strTools as s
import os
import json
import copy
from datetime import datetime
import time
global leadDir
global archiveDir
global aggLeads
def aggregate_individual_lead_files():
    """
    Aggregate every individual lead file into the aggLeads global,
    keyed first by extension and then by month.
    """
    # Get the aggLeads global
    global aggLeads

    # Get all the Files with a 'lead' extension & aggregate them
    exts = [
        'lead',
        'partial',
        'inp',
        'err',
        'nobuyer',
        'prospect',
        'sent'
    ]

    for srchExt in exts:
        agg = {}
        leads = f.recursiveGlob(leadDir, '*.cd.' + srchExt)
        print "There are {} {} files to process".format(len(leads), srchExt)

        for lead in leads:
            # Get the Base Filename
            fname = f.basename(lead)
            #uniqID = st.fetchBefore('.', fname)
            #print "File: ", lead

            # Get Lead Data
            leadData = json.loads(f.file_get_contents(lead))
            agg = agg_data(leadData, agg, fname)

        aggLeads[srchExt] = copy.deepcopy(agg)

    print "Aggregate Top Lvl Keys: ", aggLeads.keys()
    print "Aggregate Next Lvl Keys: "
    for key in aggLeads:
        print "{}: ".format(key)
        for arcDate in aggLeads[key].keys():
            print "{}: {}".format(arcDate, len(aggLeads[key][arcDate]))
    # raw_input("Press Enter to continue...")
def agg_data(leadData, agg, fname=None):
    """
    File the lead under agg[<Month_Year>][<timestamp>] based on its timeStamp field.
    """
    #print "Lead: ", leadData

    # Get the timestamp of the lead
    try:
        ts = leadData['timeStamp']
        leadData.pop('timeStamp')
    except KeyError:
        return agg

    leadDate = datetime.fromtimestamp(ts)
    arcDate = leadDate.strftime("%B_%Y")
    #print "Archive Date: ", arcDate

    try:
        agg[arcDate][ts] = leadData
    except KeyError:
        agg[arcDate] = {}
        agg[arcDate][ts] = leadData
    except TypeError:
        print "Timestamp: ", ts
        print "Lead: ", leadData
        print "Archive Date: ", arcDate
        return agg

    """
    if fname is not None:
        archive_lead(fname, arcDate)
    """
    #print "File: {} added to {}".format(fname, arcDate)
    return agg
def archive_lead(fname, arcDate):
    # Archive Path
    newArcPath = archiveDir + arcDate + '//'
    if not os.path.exists(newArcPath):
        os.makedirs(newArcPath)

    # Move the file to the archive
    os.rename(leadDir + fname, newArcPath + fname)
def reformat_old_agg_data():
    """
    Fold the old-style aggregate files into the aggLeads global,
    splitting them into complete and partial leads.
    """
    # Get the aggLeads global
    global aggLeads
    aggComplete = {}
    aggPartial = {}

    oldAggFiles = f.recursiveGlob(leadDir, '*.cd.agg')
    print "There are {} old aggregate files to process".format(len(oldAggFiles))

    for agg in oldAggFiles:
        tmp = json.loads(f.file_get_contents(agg))
        for uniqId in tmp:
            leadData = tmp[uniqId]
            if leadData['isPartial'] == True:
                aggPartial = agg_data(leadData, aggPartial)
            else:
                aggComplete = agg_data(leadData, aggComplete)

    arcData = dict(aggLeads['lead'].items() + aggComplete.items())
    aggLeads['lead'] = arcData

    arcData = dict(aggLeads['partial'].items() + aggPartial.items())
    aggLeads['partial'] = arcData
def output_agg_files():
    for ext in aggLeads:
        for arcDate in aggLeads[ext]:
            arcFile = leadDir + arcDate + '.cd.' + ext + '.agg'

            if f.file_exists(arcFile):
                tmp = json.loads(f.file_get_contents(arcFile))
            else:
                tmp = {}

            arcData = dict(tmp.items() + aggLeads[ext][arcDate].items())
            f.file_put_contents(arcFile, json.dumps(arcData))
def main():
    global leadDir
    global archiveDir
    global aggLeads

    leadDir = 'D://Server Data//eagle805//emmetrics//forms//leads//'
    archiveDir = leadDir + 'archive//'
    aggLeads = {}

    # Aggregate all the old individual files
    aggregate_individual_lead_files()

    # Reformat the old aggregate files
    reformat_old_agg_data()

    # Write it all out to an aggregate file
    output_agg_files()

if __name__ == "__main__":
    main()
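For what it's worth, f.file_put_contents above comes from my own helper library; the write step in output_agg_files boils down to roughly this if I use only the standard library instead (a sketch, not my actual helper code - write_agg_file is just a name for the example):

import json

def write_agg_file(arcFile, arcData):
    # Serialize and write in one shot; the with-block makes sure the handle is closed
    with open(arcFile, 'w') as fh:
        fh.write(json.dumps(arcData))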
Here is the read code:
from p2p.basic import files as f
from p2p.adv import strTools as st
from p2p.basic import strTools as s
import os
import json
import copy
from datetime import datetime
import time
global leadDir
global fields
global fieldTimes
global versions
def parse_agg_file(aggFile):
    global leadDir
    global fields
    global fieldTimes

    try:
        tmp = json.loads(f.file_get_contents(aggFile))
    except ValueError:
        print "{} failed the JSON load".format(aggFile)
        return False

    print "Opening: ", aggFile
    for ts in tmp:
        try:
            tmpTs = float(ts)
        except ValueError:
            print "Timestamp: ", ts
            continue

        leadData = tmp[ts]
        for field in leadData:
            if field not in fields:
                fields[field] = []
            fields[field].append(float(ts))
def determine_form_versions():
    global fieldTimes
    global versions

    # Determine all the fields and their start and stop times
    times = []
    for field in fields:
        minTs = min(fields[field])
        fieldTimes[field] = [minTs, max(fields[field])]
        times.append(minTs)
        print 'Min ts: {}'.format(minTs)

    times = set(sorted(times))
    print "Times: ", times
    print "Fields: ", fieldTimes

    versions = {}
    for ts in times:
        d = datetime.fromtimestamp(ts)
        ver = d.strftime("%d_%B_%Y")
        print "Version: ", ver
        versions[ver] = []

        for field in fields:
            if ts in fields[field]:
                versions[ver].append(field)
def main():
    global leadDir
    global fields
    global fieldTimes

    leadDir = 'D://Server Data//eagle805//emmetrics//forms//leads//'
    fields = {}
    fieldTimes = {}

    aggFiles = f.glob(leadDir + '*.lead.agg')
    for aggFile in aggFiles:
        parse_agg_file(aggFile)

    determine_form_versions()
    print "Versions: ", versions

if __name__ == "__main__":
    main()