I am scraping HTML pages. Part of the page has a table which has acts and sections of those acts mentioned in table format. For some other project I need to convert them to Dictionary. The key values are previously set (in the other project). I want to use the same key values for the dictionary and then replace corresponding sections with each new input. The code I have designed works but I am looking for better way to write it. Presently the code looks quite lengthy. The code:
from bs4 import BeautifulSoup as bs, NavigableString
openFile = open('/some path/samplePage.html')
soup = bs(openFile, 'html.parser')
acts = soup.select('#act_table td:nth-of-type(1)')
sections = soup.select('#act_table td:nth-of-type(2)')
dictionary = {}
ipc = 'indian penal code'
poa = 'prevention of atrocities'
pcso = 'protection of children from sexual'
pcr = 'protection of civil rights'
if len(acts) < 1:
print('no act mentioned')
elif len(acts) < 2:
act1 = tuple(acts[0].contents)
sections1 = tuple(sections[0].contents)
elif len(acts) < 3:
act1 = tuple(acts[0].contents)
sections1 = tuple(sections[0].contents)
act2 = tuple(acts[1].contents)
sections2 = tuple(sections[1].contents)
elif len(acts) < 4:
act1 = tuple(acts[0].contents)
sections1 = tuple(sections[0].contents)
act2 = tuple(acts[1].contents)
sections2 = tuple(sections[1].contents)
act3 = tuple(acts[2].contents)
sections3 = tuple(sections[2].contents)
elif len(acts) < 5:
act1 = tuple(acts[0].contents)
sections1 = tuple(sections[0].contents)
act2 = tuple(acts[1].contents)
sections2 = tuple(sections[1].contents)
act3 = tuple(acts[2].contents)
sections3 = tuple(sections[2].contents)
act4 = tuple(acts[3].contents)
sections4 = tuple(sections[3].contents)
else:
act1 = tuple(acts[0].contents)
sections1 = tuple(sections[0].contents)
act2 = tuple(acts[1].contents)
sections2 = tuple(sections[1].contents)
act3 = tuple(acts[2].contents)
sections3 = tuple(sections[2].contents)
act4 = tuple(acts[3].contents)
sections4 = tuple(sections[3].contents)
act5 = tuple(acts[4].contents)
if len(acts) == 0:
pass
# for first act in list
elif len(acts) == 1:
if ipc in str(act1).lower():
dictionary['IPC'] = sections1
elif poa in str(act1).lower():
dictionary['PoA'] = sections1
elif pcso in str(act1).lower():
dictionary['PCSO'] = sections1
elif pcr in str(act1).lower():
dictionary['PCR'] = sections1
else:
dictionary['Any Other Act'] = str(act1).lower()
print(dictionary)
# for 2nd act in list
elif len(acts) == 2:
if ipc in str(act1).lower():
dictionary['IPC'] = sections1
elif poa in str(act1).lower():
dictionary['PoA'] = sections1
elif pcso in str(act1).lower():
dictionary['PCSO'] = sections1
else:
dictionary['Any Other Act'] = str(act1).lower()
if ipc in str(act2).lower():
dictionary['IPC'] = sections2
elif poa in str(act2).lower():
dictionary['PoA'] = sections2
elif pcso in str(act2).lower():
dictionary['PCSO'] = sections2
else:
dictionary['Any Other Act'] = act2
print(dictionary)
# for 3rd act in list
elif len(acts) == 3:
if ipc in str(act1).lower():
dictionary['IPC'] = sections1
elif poa in str(act1).lower():
dictionary['PoA'] = sections1
elif pcso in str(act1).lower():
dictionary['PCSO'] = sections1
elif pcr in str(act1).lower():
dictionary['PCR'] = sections1
else:
dictionary['Any Other Act'] = str(act1).lower()
if ipc in str(act2).lower():
dictionary['IPC'] = sections2
elif poa in str(act2).lower():
dictionary['PoA'] = sections2
elif pcso in str(act2).lower():
dictionary['PCSO'] = sections2
elif pcr in str(act2).lower():
dictionary['PCR'] = sections2
else:
dictionary['Any Other Act'] = act2
#for 3rd option
if ipc in str(act3).lower():
dictionary['IPC'] = sections3
elif poa in str(act3).lower():
dictionary['PoA'] = sections3
elif pcso in str(act3).lower():
dictionary['PCSO'] = sections3
elif pcr in str(act3).lower():
dictionary['PCR'] = sections3
else:
dictionary['Any Other Act'] = act3
print(dictionary)
# for 4th act in list
elif len(acts) == 4:
if ipc in str(act1).lower():
dictionary['IPC'] = sections1
elif poa in str(act1).lower():
dictionary['PoA'] = sections1
elif pcso in str(act1).lower():
dictionary['PCSO'] = sections1
elif pcr in str(act1).lower():
dictionary['PCR'] = sections1
else:
dictionary['Any Other Act'] = str(act1).lower()
if ipc in str(act2).lower():
dictionary['IPC'] = sections2
elif poa in str(act2).lower():
dictionary['PoA'] = sections2
elif pcso in str(act2).lower():
dictionary['PCSO'] = sections2
elif pcr in str(act2).lower():
dictionary['PCR'] = sections2
else:
dictionary['Any Other Act'] = act2
# for 3rd option
if ipc in str(act3).lower():
dictionary['IPC'] = sections3
elif poa in str(act3).lower():
dictionary['PoA'] = sections3
elif pcso in str(act3).lower():
dictionary['PCSO'] = sections3
elif pcr in str(act3).lower():
dictionary['PCR'] = sections3
else:
dictionary['Any Other Act'] = act3
# 4th Option
if ipc in str(act4).lower():
dictionary['IPC'] = sections4
elif poa in str(act4).lower():
dictionary['PoA'] = sections4
elif pcso in str(act4).lower():
dictionary['PCSO'] = sections4
elif pcr in str(act4).lower():
dictionary['PCR'] = sections4
else:
dictionary['Any Other Act'] = act4
elif len(acts) == 5:
if ipc in str(act1).lower():
dictionary['IPC'] = sections1
elif poa in str(act1).lower():
dictionary['PoA'] = sections1
elif pcso in str(act1).lower():
dictionary['PCSO'] = sections1
elif pcr in str(act1).lower():
dictionary['PCR'] = sections1
else:
dictionary['Any Other Act'] = str(act1).lower()
if ipc in str(act2).lower():
dictionary['IPC'] = sections2
elif poa in str(act2).lower():
dictionary['PoA'] = sections2
elif pcso in str(act2).lower():
dictionary['PCSO'] = sections2
elif pcr in str(act2).lower():
dictionary['PCR'] = sections2
else:
dictionary['Any Other Act'] = act2
# for 3rd option
if ipc in str(act3).lower():
dictionary['IPC'] = sections3
elif poa in str(act3).lower():
dictionary['PoA'] = sections3
elif pcso in str(act3).lower():
dictionary['PCSO'] = sections3
elif pcr in str(act3).lower():
dictionary['PCR'] = sections3
else:
dictionary['Any Other Act'] = act3
# 4th Option
if ipc in str(act4).lower():
dictionary['IPC'] = sections4
elif poa in str(act4).lower():
dictionary['PoA'] = sections4
elif pcso in str(act4).lower():
dictionary['PCSO'] = sections4
elif pcr in str(act4).lower():
dictionary['PCR'] = sections4
else:
dictionary['Any Other Act'] = act4
print(dictionary)
The HTML code of one of the files is here: