I'm learning Python by writing a script to scrape xHamster. If anyone's familiar with the site: I'm trying to write all the URLs of a given user's videos to a .txt file.
So far I've managed to scrape the URLs from a single page, but a user's videos are spread across multiple pages and I'm struggling to loop through them.
In my attempt below I've commented where I'm trying to read the URL of the next page; however, it currently prints None. Any ideas why, and how to resolve it?
Current script:
#!/usr/bin/env python
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(chrome_options=chrome_options)

username = **ANY_USERNAME**
url = "https://xhams***.com/user/video/" + username + "/new-1.html"

driver.implicitly_wait(10)
driver.get(url)

# Every video thumbnail on the listing page is a link with class 'hRotator'.
links = driver.find_elements_by_class_name('hRotator')

with open('x--' + username + '.txt', 'w') as f:
    for link in links:
        f.write(link.get_attribute('href') + '\n')

driver.close()
My attempt at looping through the pages:
#!/usr/bin/env python
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(chrome_options=chrome_options)

username = **ANY_USERNAME**
url = "https://xhams***.com/user/video/" + username + "/new-1.html"

driver.implicitly_wait(10)
driver.get(url)

links = driver.find_elements_by_class_name('hRotator')

#nextPage = driver.find_elements_by_class_name('colR')
## TRYING TO READ THE NEXT PAGE HERE -- this prints None
print(driver.find_element_by_class_name('last').get_attribute('href'))

with open('x--' + username + '.txt', 'w') as f:
    for link in links:
        f.write(link.get_attribute('href') + '\n')

driver.close()
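For clarity, this is the sort of loop I'm ultimately trying to build. It's just a sketch: I'm assuming the next-page arrow is an <a> with class 'last' and that it disappears on the final page, neither of which I've confirmed.
from selenium.common.exceptions import NoSuchElementException

all_links = []
while True:
    # Collect the video links on the current page.
    all_links.extend(a.get_attribute('href')
                     for a in driver.find_elements_by_class_name('hRotator'))
    try:
        next_url = driver.find_element_by_class_name('last').get_attribute('href')
    except NoSuchElementException:
        break  # assumed: no 'last' element on the final page
    if not next_url:  # this is where I currently get None instead of a URL
        break
    driver.get(next_url)
If following the arrow is the wrong approach and it's better to build the new-N.html URLs directly, I'm open to that too.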
UPDATE:
I've used Philippe Oger's answer below, but modified the two methods beneath so they also handle users whose videos fit on a single page:
def find_max_pagination(self):
    start_url = 'https://www.xhamster.com/user/video/{}/new-1.html'.format(self.user)
    r = requests.get(start_url)
    tree = html.fromstring(r.content)
    # Numbered links in the pager; the pager is missing entirely when a
    # user's videos fit on a single page.
    pager_links = tree.xpath('//div[@class="pager"]/table/tr/td/div/a')
    if pager_links:
        self.max_page = max(
            int(x.text) for x in pager_links if x.text not in [None, '...']
        )
    else:
        self.max_page = 1
    return self.max_page
def generate_listing_urls(self):
    # range(0, 1) already covers the single-page case, so no separate
    # branch is needed when self.max_page is 1.
    pages = [self.paginated_listing_page(str(page)) for page in range(0, self.max_page)]
    return pages
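For anyone who lands here later, the same pager check also works standalone, without the class. A minimal sketch, assuming the pager markup matches the XPath above and that listing pages follow the new-N.html pattern; 'some_user' stands in for a real username:
import requests
from lxml import html

user = 'some_user'  # placeholder
start_url = 'https://www.xhamster.com/user/video/{}/new-1.html'.format(user)
tree = html.fromstring(requests.get(start_url).content)

# Numbered page links in the pager; the pager is absent when a user's
# videos fit on a single page.
pager_links = tree.xpath('//div[@class="pager"]/table/tr/td/div/a')
page_numbers = [int(a.text) for a in pager_links if a.text not in [None, '...']]
max_page = max(page_numbers) if page_numbers else 1

# Listing pages start at new-1.html, so the URLs are built 1-based here.
listing_urls = ['https://www.xhamster.com/user/video/{}/new-{}.html'.format(user, n)
                for n in range(1, max_page + 1)]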