Python For loop inside for loop

Question

Can someone help me with the for loop inside for loop (pdfname)?

The output should be: Roco 23380 Instructions (DE), Roco 23380 (DE), ...

I have this output now:

This is source:

This is my code:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlsxwriter
import re
import os

productlinks = []

for x in range(1, 2):
    r = requests.get(
        f'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html?p={x}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44')
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('li', class_='item product product-item')

    for item in productlist:
        for link in item.find_all('a', class_='product-item-link', href=True):
            productlinks.append(link['href'])

pdflist = []

for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')
    for tag in soup.find_all('a'):
        on_click = tag.get('onclick')
        if on_click:
            pdf = re.findall(r"'([^']*)'", on_click)[0]
            if 'pdf' in pdf:

                name = 'Roco'

            try:
                reference = soup.find(
                    'span', class_='product-head-artNr').get_text().strip()
            except Exception as e:
                print(e)

            
            try:
                pdfname = soup.find('td', class_='col-download-data').get_text().strip()
            except Exception as e:
                print(e)

            print(name, reference, pdfname)

I can only generally say that you call "soup.find(...)" in the inner loop but "soup" only changes before the inner loop. This means that these calls always return the same results until a new iteration of the outer loop begins. — Michael Butscher
– Michael Butscher, Commented Dec 11, 2022 at 21:25

Teddy · Accepted Answer · 2022-12-11 21:40:49Z

You can use findAll instead of find to get all names and then use a variable to keep track of which pdfname should be used.

for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')
    # set variable to 0
    num_of_pdfs = 0
    for tag in soup.find_all('a'):
        on_click = tag.get('onclick')
        if on_click:
            pdf = re.findall(r"'([^']*)'", on_click)[0]

            if 'pdf' in pdf:
                name = 'Roco'

            try:
                reference = soup.find(
                    'span', class_='product-head-artNr').get_text().strip()
            except Exception as e:
                print(e)

            try:
                # use find all and use the current pdf as index
                pdfname = soup.findAll('td', class_='col-download-data')[num_of_pdfs].get_text().strip()
                # increment num_of_pdfs to get the next name on next iteration
                num_of_pdfs += 1
            except Exception as e:
                print(e)

            print(name, reference, pdfname)

PRStark · Accepted Answer · 2022-12-11 21:57:11Z

1

Replace this

    try:
        pdfname = soup.find('td', class_='col-download-data').get_text().strip()
    except Exception as e:
        print(e)

with this:

        try:
            pdfname = ""
            for tag in soup.find_all('td',  class_='col-download-data'):
              pdfname = pdfname + "," + tag.get_text().strip()
            
        except Exception as e:
          print(e)

answered Dec 11, 2022 at 21:57

PRStark

1,2668 silver badges20 bronze badges

Comments

Filip Chobodicky · Accepted Answer · 2022-12-12 00:19:13Z

Thanks for help, guys. Finished one part of code:

    try:
        os.mkdir(os.path.join(os.getcwd(), pdffolder))
    except:
        pass
    os.chdir(os.path.join(os.getcwd(), pdffolder))

    for url in productlinks:
        r = requests.get(url, allow_redirects=False)
        soup = BeautifulSoup(r.content, 'html.parser')
        num_of_pdfs = 0
        for tag in soup.find_all('a'):
            on_click = tag.get('onclick')
            if on_click:
                pdf = re.findall(r"'([^']*)'", on_click)[0]
                if 'pdf' in pdf:

                    name = 'Roco'

                try:
                    reference = soup.find(
                        'span', class_='product-head-artNr').get_text().strip()
                except Exception as e:
                    print(e)

                try:
                    pdfname = soup.findAll(
                        'td', class_='col-download-data')[num_of_pdfs].get_text().strip().lower()
                    pdfname = pdfname.replace(' ', '_')
                    num_of_pdfs += 1
                except Exception as e:
                    print(e)

                pdflist.append(pdf)

                with open(name + '-' + reference + '-' + pdfname + '-' + '.pdf', 'wb') as f:
                    im = requests.get(pdf)
                    f.write(im.content)

                pdfs = {
                    'Manufacturer_name': name,
                    'Reference': reference,
                    'Documents': name + '_' + reference + '_' + pdfname + '.pdf'
                }

                doculist.append(pdfs)


pdfpath('Rocco - pdf')

Collectives™ on Stack Overflow

Python For loop inside for loop

3 Answers 3

Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

3 Answers 3

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related