I'm currently scraping websites for multiple keywords in their source code using Python 2.7. I would like to write the result for each keyword to its own column in an exported CSV file, like this:

[screenshot of the desired CSV: one row per website, with the Sitemap, Viewport and @media results each in their own column]

However, with my code I'm getting this:

[screenshot of the actual CSV: one row per keyword check, with every result written to the Sitemap column]

My code:

import urllib2
import csv

fieldnames = ['Website', 'Sitemap', 'Viewport', '@media']

def csv_writerheader(path):
    with open(path, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

def csv_writer(domainname,Sitemap, path):
    with open(path, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # writer.writeheader()
        writer.writerow({'Website': domainname, 'Sitemap': Sitemap})

csv_output_file = 'exported_print_results.csv'
keyword1 = ['sitemap']
keyword2 = ['viewport']
keyword3 = ['@media']

csv_writerheader(csv_output_file)

f = open('top1m-edited.csv')
csv_f = csv.reader(f)
for line in f:
    strdomain = line.strip()
    if '.nl' in strdomain:
        try:
            req = urllib2.Request(strdomain.strip())
            response = urllib2.urlopen(req)
            html_content = response.read()

            # keyword 1
            for searchstring in keyword1:
                if searchstring.lower() in str(html_content).lower():
                    print (strdomain, keyword1, 'found')
                    csv_writer(strdomain, 'found', csv_output_file)

                else:
                    print (strdomain, keyword1, 'not found')
                    csv_writer(strdomain, 'not found', csv_output_file)

            # keyword 2
            for searchstring in keyword2:
                if searchstring.lower() in str(html_content).lower():
                    print (strdomain, keyword2, 'found')
                    csv_writer(strdomain, 'found', csv_output_file)

                else:
                    print (strdomain, keyword2, 'not found')
                    csv_writer(strdomain, 'not found', csv_output_file)

            # keyword 3
            for searchstring in keyword3:
                if searchstring.lower() in str(html_content).lower():
                    print (strdomain, keyword3, 'found')
                    csv_writer(strdomain, 'found', csv_output_file)

                else:
                    print (strdomain, keyword3, 'not found')
                    csv_writer(strdomain, 'not found', csv_output_file)

        except urllib2.HTTPError:
            print (strdomain, 'HTTP ERROR')

        except urllib2.URLError:
            print (strdomain, 'URL ERROR')

        except urllib2.socket.error:
            print (strdomain, 'SOCKET ERROR')

        except urllib2.ssl.CertificateError:
            print (strdomain, 'SSL Certificate ERROR')
f.close()

How should I edit my code to make this work?

2 Answers

Consider using a dictionary to store the found / not found value for each keyword and passing that dictionary into your CSV write method. But before that, one of your issues is not specifying lineterminator in csv.DictWriter(), which tends to be needed for text files on Windows. Also, iterate across a single list of keywords in one loop instead of repeating the same routine for each keyword.

import urllib2
import csv
import socket   # socket.error is raised on low-level connection failures
import ssl      # ssl.CertificateError is raised on certificate mismatches

fieldnames = ['Website', 'Sitemap', 'Viewport', '@media']

def csv_writerheader(path):
    with open(path, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator='\n')
        writer.writeheader()

def csv_writer(dictdata, path):
    with open(path, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator='\n')
        writer.writerow(dictdata)

csv_output_file = 'exported_print_results.csv'
# LIST OF KEYWORDS (TITLE CASE TO MATCH FIELD NAMES)
keywords = ['Sitemap', 'Viewport', '@media']

csv_writerheader(csv_output_file)

with open('top1m-edited.csv', 'r') as f:
    # iterate over the raw lines directly; each line holds one domain
    for line in f:
        strdomain = line.strip()
        # INITIALIZE DICT
        data = {'Website': strdomain}

        if '.nl' in strdomain:        
            try:
                req = urllib2.Request(strdomain.strip())
                response = urllib2.urlopen(req)
                html_content = response.read()

                # ITERATE THROUGH EACH KEY AND UPDATE DICT
                for searchstring in keywords:
                    if searchstring.lower() in str(html_content).lower():
                        print (strdomain, searchstring, 'found')
                        data[searchstring] = 'found'    
                    else:
                        print (strdomain, searchstring, 'not found')
                        data[searchstring] = 'not found'

                # CALL METHOD PASSING DICT AND OUTPUT FILE
                csv_writer(data, csv_output_file)

            except urllib2.HTTPError:
                print (strdomain, 'HTTP ERROR')

            except urllib2.URLError:
                print (strdomain, 'URL ERROR')

            except socket.error:
                print (strdomain, 'SOCKET ERROR')

            except ssl.CertificateError:
                print (strdomain, 'SSL Certificate ERROR')

CSV Output

Website                 Sitemap     Viewport    @media
http://www.google.nl    not found   not found   found
http://www.youtube.nl   not found   found       not found
http://www.facebook.nl  not found   found       not found
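
One design note on the code above: csv_writer() reopens the output file in append mode for every domain, which works but is slow for long domain lists. A minimal sketch of an alternative, opening the file once and reusing a single writer (the sample rows here simply mirror the table above, as a stand-in for the dicts built in the scraping loop):

import csv

fieldnames = ['Website', 'Sitemap', 'Viewport', '@media']

# Sample rows standing in for the dicts the scraping loop builds.
rows = [
    {'Website': 'http://www.google.nl', 'Sitemap': 'not found',
     'Viewport': 'not found', '@media': 'found'},
]

# Open the output file once and reuse a single writer, instead of
# reopening the file in append mode for every domain.
with open('exported_print_results.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                            lineterminator='\n')
    writer.writeheader()
    for data in rows:
        writer.writerow(data)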

The default separator in your spreadsheet appears not to be a comma; most likely it is a TAB. You could either change the separator to comma when importing (spreadsheet programs typically show an import dialog that lets you select it) or write the file from Python with TAB as the field separator.
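
For the second option, a minimal sketch using csv.DictWriter with a TAB delimiter (the fieldnames, output file name and sample row are carried over from the question and the table above, as assumptions):

import csv

fieldnames = ['Website', 'Sitemap', 'Viewport', '@media']

# Write the output with TAB as the field separator so spreadsheet
# programs that expect tab-delimited text pick up the columns.
with open('exported_print_results.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                            delimiter='\t', lineterminator='\n')
    writer.writeheader()
    writer.writerow({'Website': 'http://www.google.nl',
                     'Sitemap': 'not found',
                     'Viewport': 'not found',
                     '@media': 'found'})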
