I am trying to make a CSV file to upload to a Shopify store. According to Shopify, you must do the following to add multiple images when importing:

Insert new rows (one per picture).

Copy + paste the "handle".

Copy + paste the image URLs.

Thus, the first image goes in the first row, and all subsequent images go in rows below. The example CSV is located here: https://help.shopify.com/csv/product_template.csv
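For example, a product with three images would look something like this (columns abbreviated; the handle and URLs here are made up for illustration):

Handle,Title,Body (HTML),Image Src
malabar-armchair,Malabar Armchair,&lt;p&gt;Solid teak armchair&lt;/p&gt;,https://example.com/chair-1.jpg
malabar-armchair,,,https://example.com/chair-2.jpg
malabar-armchair,,,https://example.com/chair-3.jpg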

I would like to write something that loops through an array (which looks like the following, except significantly longer) and converts it to a CSV, putting every photo except the first into a new row. Here is my attempted code:

import scrapy
from scrapy.crawler import CrawlerProcess
import csv


class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"

    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }

    data = []

    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }

    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }

    def start_requests(self):
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")

        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
        self.data.append(
            {
                "Handle": response.css("h1.page-title ::text").get().lower(),
                "Title": response.css("h1.page-title ::text").get(),
                "Descritpion": response.css(
                    "div#description_product_show > p::text"
                ).get(),
                "Price": response.css("div.original-pricing-wrapper")
                .css("span.price ::text")
                .getall()[28],
                "Delivery": response.css("p.availability-message > span::text").get(),
                "Color": color,
                "Dimensions": dimension,
                "Material": material,
                "Image_Src": response.css("div.MagicSlideshow")
                .css("a img::attr(src)")
                .getall(),
            }
        )

        # print(self.data)

        f = csv.writer(open("malabar_furniture_shopify.csv", "w", newline=""))
        f.writerow(
            [
                "Handle",
                "Title",
                "Descritpion",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )

        for d in self.data:
            images = d["Image_Src"]
            f.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Descritpion"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )

            while images:
                f.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )

if __name__ == "__main__":
    process = CrawlerProcess()

    process.crawl(SweetPeaAndWillowSpider)

    process.start()

Output: (screenshot of the generated CSV omitted)

Update: I tried opening the file at the start and defining the headers there as well, but it made no difference. I also tried appending to the file, but that produces duplicate entries with duplicate headers.

I am getting Image_Src links for only one product, the last one. Does anyone know how to fix this? Thanks.


1 Answer


You are creating and rewriting "malabar_furniture_shopify.csv" for each response. The result is that you will only ever see the final entry, as every earlier entry gets overwritten.

One possible workaround would be to append your results:

with open("malabar_furniture_shopify.csv", "a", newline="") as csvfile:

You would then need a flag to ensure the header is only written for your first entry; a sketch of that approach follows. newline="" is used to ensure you don't see extra blank rows in the output.
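A minimal sketch of that flag approach, assuming it lives inside the spider (the _header_written attribute is an illustrative name, not part of the original code):

def parse_details(self, response):
    # ... build the product rows as before ...
    with open("malabar_furniture_shopify.csv", "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        # Only the first call during the crawl writes the header row.
        if not getattr(self, "_header_written", False):
            writer.writerow([
                "Handle", "Title", "Description", "Price", "Delivery",
                "Color", "Dimensions", "Material", "Image_Src",
            ])
            self._header_written = True
        # ... writer.writerow(...) for each product row ...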

A better approach would be to open the file at the start and write the header once. Then use the same file handle to write each row. At the end, ensure the file is closed.

Try the following:

import scrapy
from scrapy.crawler import CrawlerProcess
import csv


class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"

    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }

    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }

    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }

    def start_requests(self):
        self.f_output = open("malabar_furniture_shopify.csv", "w", newline="")
        self.csv_output = csv.writer(self.f_output)
        
        self.csv_output.writerow(
            [
                "Handle",
                "Title",
                "Descritpion",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )
        
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")

        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
        data = []
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
            data.append(
                {
                    "Handle": response.css("h1.page-title ::text").get().lower(),
                    "Title": response.css("h1.page-title ::text").get(),
                    "Descritpion": response.css(
                        "div#description_product_show > p::text"
                    ).get(),
                    "Price": response.css("div.original-pricing-wrapper")
                    .css("span.price ::text")
                    .getall()[28],
                    "Delivery": response.css("p.availability-message > span::text").get(),
                    "Color": color,
                    "Dimensions": dimension,
                    "Material": material,
                    "Image_Src": response.css("div.MagicSlideshow")
                    .css("a img::attr(src)")
                    .getall(),
                }
            )

        for d in data:
            images = d["Image_Src"]
            self.csv_output.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Descritpion"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,  # first image stays on the product row
                ]
            )

            # Each remaining image gets its own row, with all other columns blank.
            while images:
                self.csv_output.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; close the output file here.
        self.f_output.close()


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SweetPeaAndWillowSpider)
    process.start()

The reason for the duplicates is that you were always appending to a global data list.
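To see why, here is a minimal standalone illustration of that pattern (the class and names are invented for the demonstration):

class Demo:
    data = []  # class attribute: a single list shared by every call

    def parse(self, item):
        self.data.append(item)  # keeps growing across responses
        return list(self.data)  # each "write" re-emits everything seen so far

d = Demo()
print(d.parse("product-1"))  # ['product-1']
print(d.parse("product-2"))  # ['product-1', 'product-2'] -> product-1 written twice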


Comments

I tried opening the file at the start and defining the headers, but it made no difference. I tried appending to the file, but it gives duplicated headers and entries. I have updated the code with your suggestion.
I've added a working example for you
Thank you so much. I tried your solution; it dealt with the duplicate headers, but I am still getting duplicate entries when writing each new row. docs.google.com/spreadsheets/d/…
I am guessing that is an issue with how you are asking scrapy to trawl the page; perhaps it keeps giving results from the first page?
I think your issue is related to repeatedly appending to a global data list. I have updated the answer to use a local version.
