Thanks to @Ignacio for suggesting intern():
try:
    _intern = intern  # Python 2: intern() is a builtin
except NameError:
    from sys import intern as _intern  # Python 3: it lives in sys


def procLog(logName, userDict):
    """Merge the ``name:action`` records of one log file into *userDict*.

    Each non-blank line is split at its first colon: the text before it is
    the user name and the rest of the line is that user's action.  A later
    record for the same name overwrites the earlier one.

    Args:
        logName: Path of the log file to read.
        userDict: Dictionary updated in place, mapping name to action.

    Returns:
        The same *userDict* object that was passed in.

    Raises:
        ValueError: If a non-blank line contains no colon.
    """
    # ``with`` closes the file even when a malformed line raises.
    with open(logName, 'r') as inf:
        # Iterate the file lazily instead of calling readlines(), which
        # would hold the entire log in memory at once.
        for ln in inf:
            if not ln.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split on the first colon only, so an action that itself
            # contains ':' is kept whole instead of raising.
            name, act = ln.split(':', 1)
            # The action keeps its trailing newline.  Interning lets every
            # repeated action string share a single object.
            userDict[name] = _intern(act)
    return userDict
def doLogs(logNameList):
    """Build one name-to-action dictionary from several log files.

    The logs are handed to procLog one at a time, in the order given,
    starting from an empty dictionary.

    Args:
        logNameList: Iterable of log file names.

    Returns:
        The dictionary returned by the last procLog call, or an empty
        dictionary when *logNameList* is empty.
    """
    merged = {}
    for path in logNameList:
        merged = procLog(path, merged)
    return merged
def writeOrderedLog(logName, userDict):
    """Write *userDict* to a file as ``name:action`` lines sorted by name.

    Args:
        logName: Path of the output file; it is created or truncated.
        userDict: Mapping of user name to action string.
    """
    # ``with`` guarantees the file is closed even if a write fails.
    with open(logName, 'w') as outf:
        # sorted() accepts any mapping, unlike keys().sort(), which only
        # works when keys() returns a list.
        for k in sorted(userDict):
            record = k + ':' + userDict[k]
            # Terminate records whose action lacks a newline so that
            # consecutive records never run together on one line.
            if not record.endswith('\n'):
                record += '\n'
            outf.write(record)
def main():
    """Merge the three daily logs and write the result to 'cumulativeLog'."""
    daily_logs = ['log20101214', 'log20101215', 'log20101216']
    writeOrderedLog('cumulativeLog', doLogs(daily_logs))
The question, then, is how much memory this will consume.
def makeUserName():
    """Return a random user name built from five syllables.

    The first syllable is title-cased; the remaining four are appended
    unchanged.
    """
    syllables = [
        'ba', 'ma', 'ta', 'pre', 're', 'cu', 'pro', 'do', 'tru', 'ho', 'cre',
        'su', 'si', 'du', 'so', 'tri', 'be', 'hy', 'cy', 'ny', 'quo', 'po',
    ]
    # 22 syllables in 5 positions: 22**5 is about 5.1 million possible names.
    pick = random.choice
    pieces = [pick(syllables).title()]
    for _ in range(4):
        pieces.append(pick(syllables))
    return ''.join(pieces)
# Benchmark: fill a dictionary with random user names and record how its
# reported size grows as entries are added.
ch = random.choice
states = [
    'joined',
    'added pic',
    'added article',
    'added comment',
    'voted',
    'logged out',
]
d = {}
t = []  # one (number of entries, sys.getsizeof(d)) sample per outer pass
for i in xrange(1000):
    # 8000 insertions between samples; a name generated twice simply
    # overwrites its existing entry.
    for j in xrange(8000):
        d[makeUserName()] = ch(states)
    # NOTE: sys.getsizeof is shallow - it reports the dictionary object
    # itself, not the key and value strings it refers to.
    t.append((len(d), sys.getsizeof(d)))
which results in

(horizontal axis = number of user names, vertical axis = memory usage in bytes) which is... slightly weird. It looks like a dictionary preallocates quite a lot of memory, then doubles it every time it gets too full?
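Anyway, 4 million users take just under 100MB of RAM - but the reallocation to that size actually happens at around 3 million users (at 50MB), so if the doubling holds, you will need about 800MB of RAM to process 24 to 48 million users. Note that sys.getsizeof() measures only the dictionary's own hash table, not the name and state strings it refers to, so the real total will be higher.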