
How to scrape multiple pages with Selenium?

I am trying to scrape multiple pages by clicking the next-page button, but I get an error. Is there a method for doing this? This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

productlink = []

def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx'
        driver.get(URL)
        time.sleep(3)
        links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
        for link in links:
            link_href = link.get_attribute("href")
            if link_href.startswith("https://www.ifep.ro/"):
                productlink.append(link_href)

        for k in range(1, 5):
            for product in productlink:
                driver.get(product)
                time.sleep(2)
                title = driver.find_element(By.CSS_SELECTOR, '#HeadingContent_lblTitle').text
                d1 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
                d2 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
                d3 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
                d4 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()

                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{k}"))
                ).click()

                print(title, d1, d2, d3, d4)
             
                # driver.back()
        time.sleep(2)

        driver.quit()


supplyvan_scraper()

1 Answer

You have some errors:

  1. You are clicking the next-page button inside the loop that visits the individual links.
  2. After visiting all the links on a page you would need to navigate back before clicking next page. If you have visited 15 links, you would have to go back 15 times, or save the source URL and return to it directly.

A better solution:

  1. Crawl all the links first, clicking only the next-page button.
  2. Then visit each link, scrape the data, and print it (see the sketches below).
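
Since the question uses Selenium, here is a minimal sketch of the same two-phase approach in Selenium 4 syntax. It is untested and simply reuses the selectors from the question and from the Playwright code below, so treat it as an outline rather than a drop-in script:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.get("https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")

# Phase 1: paginate with the "next" button only, collecting detail-page links.
productlinks = []
for _ in range(9):
    for link in driver.find_elements(By.XPATH, "//div[@class='list-group']//a"):
        href = link.get_attribute("href")
        if href and "LawyerFile.aspx" in href:
            productlinks.append(href)
    driver.find_element(By.ID, "MainContent_PagerTop_NavNext").click()
    time.sleep(2)  # crude wait for the pager postback; a WebDriverWait would be more robust

# Phase 2: visit each collected link and scrape the fields.
for product in productlinks:
    driver.get(product)
    title = driver.find_element(By.ID, "HeadingContent_lblTitle").text
    d1 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
    d2 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
    d3 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
    d4 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()
    print(title, d1, d2, d3, d4)

driver.quit()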

Here is some code that does the same with Playwright:

import time
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch(headless=False)
    baseurl = "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx"
    page = browser.new_page()
    page.goto(baseurl)

    # Phase 1: collect the detail-page links, paginating with the "next" button only.
    productlinks = []
    for k in range(1, 10):
        links = page.query_selector_all("//div[@class='list-group']//a")
        for link in links:
            link_href = link.get_attribute("href")
            if link_href.startswith("LawyerFile.aspx"):
                productlinks.append("https://www.ifep.ro/justice/lawyers/" + link_href)
        page.wait_for_selector("#MainContent_PagerTop_NavNext").click()
        time.sleep(2)  # wait for the next page to load

    # Phase 2: visit every collected link, scrape the fields, and print them.
    for product in productlinks:
        page.goto(product)
        title = page.wait_for_selector('#HeadingContent_lblTitle').text_content()
        d1 = page.wait_for_selector("//div[@class='col-md-10']//p[1]").text_content().strip()
        d2 = page.wait_for_selector("//div[@class='col-md-10']//p[2]").text_content().strip()
        d3 = page.wait_for_selector("//div[@class='col-md-10']//p[3]//span").text_content().strip()
        d4 = page.wait_for_selector("//div[@class='col-md-10']//p[4]").text_content().strip()
        print(title, d1, d2, d3, d4)

    browser.close()
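
Because all the links are collected up front, the pager is only ever clicked while the listing page is open, so no back-navigation is needed during the scraping phase.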

OUTPUT:

ILIE Marius-Constantin, Baroul Ilfov Avocat Definitiv, Baroul Ilfov Dată înscriere: 14-03-2011 ACTIV Instanţe cu drept de concluzii: Toate instanţele
DIN GEORGIANA-CLAUDIA, Baroul Bucureşti Avocat Definitiv, Baroul Bucureşti Dată înscriere: 15-05-2008 ACTIV Instanţe cu drept de concluzii: Toate instanţele
MOLDANSCHI ANDREEA-IOANA, Baroul Bucureşti Avocat Stagiar, Baroul Bucureşti Dată înscriere: 30-05-2022 ACTIV Instanţe cu drept de concluzii: Judecătorii

Comments

Looks like Playwright was just installed or updated. Please run the following command to download new browsers: playwright install. <3 Playwright Team
How do I run playwright install from the command line?
Just install Playwright; check the official docs: playwright.dev/python/docs/intro. I am using the PyCharm IDE and have no problem running it from the IDE.
playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.
Okay, I'll try it. Did you run the code in PyCharm?
