0

I'm working on a web scraper to collect Facebook post comments for analytics purposes.

On Facebook, after login, we can scroll the post page to get all the comments, which are loaded dynamically as the page is scrolled. Unfortunately, I can't get the page to scroll in headless mode, though it works in non-headless mode.

I have referred to the following posts: Post 1, Post 2

Here's my code

import datetime
import re
import time

import yake
from decouple import config
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Desktop Chrome user-agent string passed to the browser via --user-agent.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"

# Build the Chrome options by applying each command-line switch in order;
# "--headless=new" is what makes the browser run without a visible window.
options = Options()
for _chrome_flag in (
    '--disable-gpu-sandbox',
    '--disable-gpu',
    '--disable-software-rasterizer',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    "--window-size=1280,700",
    "--headless=new",
    f"--user-agent={user_agent}",
):
    options.add_argument(_chrome_flag)

# Start the Chrome session that the rest of the script drives.
driver = webdriver.Chrome(options=options)

# Open the Facebook landing page and wait up to 10 seconds for the e-mail
# field to be present in the DOM before touching the form.
driver.get("https://www.facebook.com/")
login_wait = WebDriverWait(driver, 10)
email_locator = (By.ID, "email")
email_input = login_wait.until(EC.presence_of_element_located(email_locator))
password_input = driver.find_element(By.ID, "pass")

# Credentials are looked up through `config` (imported from decouple), so
# they are not hard-coded here; pressing RETURN in the password field
# submits the form.
email_input.send_keys(config("FB_EMAIL_INPUT"))
password_input.send_keys(config("FB_PASSWORD_INPUT"))
password_input.send_keys(Keys.RETURN)

# Short pause after submitting the form before polling for the logged-in UI.
time.sleep(1)
try:
    # Wait up to 10 seconds for the "Your profile" element to appear.
    # WebDriverWait.until() signals an expired wait with TimeoutException
    # (it swallows NoSuchElementException while polling), so that is the
    # exception that indicates the element never showed up.
    profile = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@aria-label='Your profile']"))
    )
except TimeoutException:
    print("Login failed")
else:
    # Only reached when the wait returned an element.
    print("Login successful")


# URL of the Facebook post page that is opened and scrolled below.
POST_URL = "https://www.facebook.com/thebetterindia/posts/pfbid025Yo2f5Qsd8NDL4AoFoHuvjeAURiRVc7rQ4uZBbULMuUWCfZ9NURRfeVha7aPpnn3l"
# Navigate the same browser session used for the login to the post page.
driver.get(POST_URL)

def infinite_scroll(driver, timeout=10, max_scrolls=None):
    """Scroll the window to the bottom until the page stops growing.

    After every scroll the function sleeps for ``timeout`` seconds and then
    re-reads ``document.body.scrollHeight``; it stops as soon as a scroll
    leaves that height unchanged.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this script).
        timeout: Seconds to sleep after each scroll so that content loaded
            in response to the scroll can be added before the height is
            measured again.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height stops changing,
            which never ends on a page that keeps growing.

    Returns:
        None.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(timeout)

        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            # Nothing was appended after this scroll: treat as the end.
            break

        last_height = new_height

# Scroll the post page with a 2-second pause between scrolls. Any error is
# reported on stdout instead of being propagated.
try:
    infinite_scroll(driver, timeout=2)
except Exception as scroll_error:
    print(f"An exception occurred: {scroll_error}")

1 Answer 1

0

I had the same issue sending scrolling scripts to headless selenium. Headless browsers sometimes do not handle certain actions like scrolling in the same way as browsers with a GUI.

You can try using ActionChains instead.

from selenium.webdriver.common.action_chains import ActionChains

def infinite_scroll(driver, timeout=10):
    """Page down with simulated key presses until the page stops moving.

    A single PAGE_DOWN only advances the view by about one screen, so the
    document height alone usually does not change after one key press.
    Comparing the height by itself would therefore end the loop long before
    the bottom is reached. The loop instead tracks the vertical scroll
    offset together with the document height and stops only when a key
    press changes neither of them.

    Args:
        driver: Selenium WebDriver controlling the page.
        timeout: Seconds to sleep after each key press so that newly
            requested content can be added before the state is re-read.

    Returns:
        None.
    """
    # Returned by Selenium as a two-item list: [scroll offset, page height].
    state_script = "return [window.scrollY, document.body.scrollHeight];"
    last_state = driver.execute_script(state_script)

    while True:
        # Use ActionChains to scroll
        ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(timeout)

        new_state = driver.execute_script(state_script)
        if new_state == last_state:
            # The view did not move and the page did not grow: either the
            # bottom was reached with nothing new loaded, or the key press
            # had no effect at all.
            break

        last_state = new_state

Instead of window.scrollTo, ActionChains is used to send Keys.PAGE_DOWN. This more closely mimics user interaction.

If Keys.PAGE_DOWN does not scroll sufficiently, try using Keys.SPACE or a combination of multiple PAGE_DOWN actions and adjust the timeout to ensure that the page has enough time to load new elements after each scroll.

Hope it works for you!

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.