1

Can someone help me with the for loop inside for loop (pdfname)?

The output should be: Roco 23380 Instructions (DE), Roco 23380 (DE), ...

I have this output now:

enter image description here

This is source:

enter image description here

This is my code:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlsxwriter
import re
import os

productlinks = []

for x in range(1, 2):
    r = requests.get(
        f'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html?p={x}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('li', class_='item product product-item')

    for item in productlist:
        for link in item.find_all('a', class_='product-item-link', href=True):
            productlinks.append(link['href'])

pdflist = []

for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')
    for tag in soup.find_all('a'):
        on_click = tag.get('onclick')
        if on_click:
            pdf = re.findall(r"'([^']*)'", on_click)[0]
            if 'pdf' in pdf:

                name = 'Roco'

            try:
                reference = soup.find(
                    'span', class_='product-head-artNr').get_text().strip()
            except Exception as e:
                print(e)

            
            try:
                pdfname = soup.find('td', class_='col-download-data').get_text().strip()
            except Exception as e:
                print(e)

            print(name, reference, pdfname)
1
  • I can only generally say that you call "soup.find(...)" in the inner loop but "soup" only changes before the inner loop. This means that these calls always return the same results until a new iteration of the outer loop begins. Commented Dec 11, 2022 at 21:25

3 Answers 3

1

You can use findAll instead of find to get all names and then use a variable to keep track of which pdfname should be used.

for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')
    # set variable to 0
    num_of_pdfs = 0
    for tag in soup.find_all('a'):
        on_click = tag.get('onclick')
        if on_click:
            pdf = re.findall(r"'([^']*)'", on_click)[0]

            if 'pdf' in pdf:
                name = 'Roco'

            try:
                reference = soup.find(
                    'span', class_='product-head-artNr').get_text().strip()
            except Exception as e:
                print(e)

            try:
                # use find all and use the current pdf as index
                pdfname = soup.findAll('td', class_='col-download-data')[num_of_pdfs].get_text().strip()
                # increment num_of_pdfs to get the next name on next iteration
                num_of_pdfs += 1
            except Exception as e:
                print(e)

            print(name, reference, pdfname)
Sign up to request clarification or add additional context in comments.

Comments

1

Replace this

    try:
        pdfname = soup.find('td', class_='col-download-data').get_text().strip()
    except Exception as e:
        print(e)

with this:

        try:
            pdfname = ""
            for tag in soup.find_all('td',  class_='col-download-data'):
              pdfname = pdfname + "," + tag.get_text().strip()
            
        except Exception as e:
          print(e)

Comments

1

Thanks for help, guys. Finished one part of code:

    try:
        os.mkdir(os.path.join(os.getcwd(), pdffolder))
    except:
        pass
    os.chdir(os.path.join(os.getcwd(), pdffolder))

    for url in productlinks:
        r = requests.get(url, allow_redirects=False)
        soup = BeautifulSoup(r.content, 'html.parser')
        num_of_pdfs = 0
        for tag in soup.find_all('a'):
            on_click = tag.get('onclick')
            if on_click:
                pdf = re.findall(r"'([^']*)'", on_click)[0]
                if 'pdf' in pdf:

                    name = 'Roco'

                try:
                    reference = soup.find(
                        'span', class_='product-head-artNr').get_text().strip()
                except Exception as e:
                    print(e)

                try:
                    pdfname = soup.findAll(
                        'td', class_='col-download-data')[num_of_pdfs].get_text().strip().lower()
                    pdfname = pdfname.replace(' ', '_')
                    num_of_pdfs += 1
                except Exception as e:
                    print(e)

                pdflist.append(pdf)

                with open(name + '-' + reference + '-' + pdfname + '-' + '.pdf', 'wb') as f:
                    im = requests.get(pdf)
                    f.write(im.content)

                pdfs = {
                    'Manufacturer_name': name,
                    'Reference': reference,
                    'Documents': name + '_' + reference + '_' + pdfname + '.pdf'
                }

                doculist.append(pdfs)


pdfpath('Rocco - pdf')

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.