I am attempting to create an efficient algorithm to pull all of the similar elements between two lists. The problem is two fold. First, I can not seem to find any similar algorithms online. Second, there should be a more efficient way.
By 'similar', I mean a predesigned way to find all similar elements between two lists in a timely fashion.
Currently, I am taking a greedy approach by:
- Sorting the lists that are being compared,
- Comparing each element in the shorter list to each element in the larger list,
- Since the
largeListandsmallListare sorted we can save the last index that was visited, - Continue from the previous index (
largeIndex).
Currently, the run-time seems to be average of O(nlog(n)). This can be seen by running the test cases listed after this block of code.
This is being run on Python 3.3.
Right now, my code looks as such:
def compare(small,large,largeStart,largeEnd):
for i in range(largeStart, largeEnd):
if small==large[i]:
return [1,i]
if small<large[i]:
if i!=0:
return [0,i-1]
else:
return [0, i]
return [0,largeStart]
def determineLongerList(aList, bList):
if len(aList)>len(bList):
return (aList, bList)
elif len(aList)<len(bList):
return (bList, aList)
else:
return (aList, bList)
def compareElementsInLists(aList, bList):
import time
startTime = time.time()
holder = determineLongerList(aList, bList)
sameItems = []
iterations = 0
##########################################
smallList = sorted(holder[1])
smallLength = len(smallList)
smallIndex = 0
largeList = sorted(holder[0])
largeLength = len(largeList)
largeIndex = 0
while (smallIndex<smallLength):
boolean = compare(smallList[smallIndex],largeList,largeIndex,largeLength)
if boolean[0]==1:
#`compare` returns 1 as True
sameItems.append(smallList[smallIndex])
oldIndex = largeIndex
largeIndex = boolean[1]
else:
#else no match and possible new index
oldIndex = largeIndex
largeIndex = boolean[1]
smallIndex+=1
iterations =largeIndex-oldIndex+iterations+1
print('RAN {it} OUT OF {mathz} POSSIBLE'.format(it=iterations, mathz=smallLength*largeLength))
print('RATIO:\t\t'+str(iterations/(smallLength*largeLength))+'\n')
return sameItems
, and here are some test cases:
def testLargest():
import time
from random import randint
print('\n\n******************************************\n')
start_time = time.time()
lis = []
for i in range(0,1000000):
ran = randint(0,1000000)
lis.append(ran)
lis2 = []
for i in range(0,1000000):
ran = randint(0,1000000)
lis2.append(ran)
timeTaken = time.time()-start_time
print('CREATING LISTS TOOK:\t\t'+str(timeTaken))
print('\n******************************************')
start_time = time.time()
c = compareElementsInLists(lis, lis2)
timeTaken = time.time()-start_time
print('COMPARING LISTS TOOK:\t\t'+str(timeTaken))
print('NUMBER OF SAME ITEMS:\t\t'+str(len(c)))
print('\n******************************************')
#testLargest()
'''
One rendition of testLargest:
******************************************
CREATING LISTS TOOK: 21.009342908859253
******************************************
RAN 999998 OUT OF 1000000000000 POSSIBLE
RATIO: 9.99998e-07
COMPARING LISTS TOOK: 13.99990701675415
NUMBER OF SAME ITEMS: 632328
******************************************
'''
def testLarge():
import time
from random import randint
print('\n\n******************************************\n')
start_time = time.time()
lis = []
for i in range(0,1000000):
ran = randint(0,100)
lis.append(ran)
lis2 = []
for i in range(0,1000000):
ran = randint(0,100)
lis2.append(ran)
timeTaken = time.time()-start_time
print('CREATING LISTS TOOK:\t\t'+str(timeTaken))
print('\n******************************************')
start_time = time.time()
c = compareElementsInLists(lis, lis2)
timeTaken = time.time()-start_time
print('COMPARING LISTS TOOK:\t\t'+str(timeTaken))
print('NUMBER OF SAME ITEMS:\t\t'+str(len(c)))
print('\n******************************************')
testLarge()
setonce the data becomes large enough. Also,setdoes not return he entire collection, but just similar elements. I am looking intocollections.Counternow... \$\endgroup\$