
How to scrape multiple pages with Selenium?

I am trying to scrape multiple pages by clicking the next-page button, but I get an error. Is there a method for doing this? This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

productlink = []

def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx'
        driver.get(URL)
        time.sleep(3)
        links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
        for link in links:
            link_href = link.get_attribute("href")
            if link_href.startswith("https://www.ifep.ro/"):
                productlink.append(link_href)

        for k in range(1, 5):
            for product in productlink:
                driver.get(product)
                time.sleep(2)
                title = driver.find_element(By.CSS_SELECTOR, '#HeadingContent_lblTitle').text
                d1 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
                d2 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
                d3 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
                d4 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()

                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{k}"))
                ).click()

                print(title, d1, d2, d3, d4)
             
                # driver.back()
        time.sleep(2)

        driver.quit()


supplyvan_scraper()

1 Answer

You have some errors:

  1. You are clicking the next-page button inside the loop that visits the individual links.
  2. After visiting all the links on a page you would need to navigate back before clicking next page. If you have visited 15 links, you would have to go back 15 times, or save the source URL and return to it directly.

A better solution:

  1. Crawl all the links first, clicking only the next-page button.
  2. Then visit each link, scrape the data, and print it (see the sketches below).
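
Since the question uses Selenium, here is a minimal sketch of the same two-phase approach in Selenium 4 syntax. It is untested and simply reuses the selectors from the question and from the Playwright code below, so treat it as an outline rather than a drop-in script:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.get("https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")

# Phase 1: paginate with the "next" button only, collecting detail-page links.
productlinks = []
for _ in range(9):
    for link in driver.find_elements(By.XPATH, "//div[@class='list-group']//a"):
        href = link.get_attribute("href")
        if href and "LawyerFile.aspx" in href:
            productlinks.append(href)
    driver.find_element(By.ID, "MainContent_PagerTop_NavNext").click()
    time.sleep(2)  # crude wait for the pager postback; a WebDriverWait would be more robust

# Phase 2: visit each collected link and scrape the fields.
for product in productlinks:
    driver.get(product)
    title = driver.find_element(By.ID, "HeadingContent_lblTitle").text
    d1 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[1]").text.strip()
    d2 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[2]").text.strip()
    d3 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[3]//span").text.strip()
    d4 = driver.find_element(By.XPATH, "//div[@class='col-md-10']//p[4]").text.strip()
    print(title, d1, d2, d3, d4)

driver.quit()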

Here is some code that does the same with Playwright:

import time
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch(headless=False)
    baseurl = "https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx"
    page = browser.new_page()
    page.goto(baseurl)

    # Phase 1: collect the detail-page links, paginating with the "next" button only.
    productlinks = []
    for k in range(1, 10):
        links = page.query_selector_all("//div[@class='list-group']//a")
        for link in links:
            link_href = link.get_attribute("href")
            if link_href.startswith("LawyerFile.aspx"):
                productlinks.append("https://www.ifep.ro/justice/lawyers/" + link_href)
        page.wait_for_selector("#MainContent_PagerTop_NavNext").click()
        time.sleep(2)  # wait for the next page to load

    # Phase 2: visit every collected link, scrape the fields, and print them.
    for product in productlinks:
        page.goto(product)
        title = page.wait_for_selector('#HeadingContent_lblTitle').text_content()
        d1 = page.wait_for_selector("//div[@class='col-md-10']//p[1]").text_content().strip()
        d2 = page.wait_for_selector("//div[@class='col-md-10']//p[2]").text_content().strip()
        d3 = page.wait_for_selector("//div[@class='col-md-10']//p[3]//span").text_content().strip()
        d4 = page.wait_for_selector("//div[@class='col-md-10']//p[4]").text_content().strip()
        print(title, d1, d2, d3, d4)

    browser.close()
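
Because all the links are collected up front, the pager is only ever clicked while the listing page is open, so no back-navigation is needed during the scraping phase.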

OUTPUT:

ILIE Marius-Constantin, Baroul Ilfov Avocat Definitiv, Baroul Ilfov Dată înscriere: 14-03-2011 ACTIV Instanţe cu drept de concluzii: Toate instanţele
DIN GEORGIANA-CLAUDIA, Baroul Bucureşti Avocat Definitiv, Baroul Bucureşti Dată înscriere: 15-05-2008 ACTIV Instanţe cu drept de concluzii: Toate instanţele
MOLDANSCHI ANDREEA-IOANA, Baroul Bucureşti Avocat Stagiar, Baroul Bucureşti Dată înscriere: 30-05-2022 ACTIV Instanţe cu drept de concluzii: Judecătorii

Comments

Looks like Playwright was just installed or updated. Please run the following command to download new browsers: playwright install. <3 Playwright Team
How do I run playwright install from the command line?
Just install Playwright; check the official docs: playwright.dev/python/docs/intro. I am using the PyCharm IDE and have no problem running it from the IDE.
playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.
Okay, I'll try it. Did you run the code in PyCharm?
