I'm trying to get all the links to the articles (which happen to have the class 'title may-blank' to denote them). I'm trying to figure out why the code below generates a whole bunch of "href=" when I run it, instead of returning with the actual URL. I also get a bunch of random text and links after the failed 25 article URLs (all 'href='), but not sure why that's happening since it should stop after it stop finding the class 'title may-blank'. Can you guys help me find out what's wrong?
import urllib2
def get_page(page):
response = urllib2.urlopen(page)
html = response.read()
p = str(html)
return p
def get_next_target(page):
start_link = page.find('title may-blank')
start_quote = page.find('"', start_link + 4)
end_quote = page.find ('"', start_quote + 1)
aurl = page[start_quote+1:end_quote] # Gets Article URL
return aurl, end_quote
def print_all_links(page):
while True:
aurl, endpos = get_next_target(page)
if aurl:
print("%s" % (aurl))
print("")
page = page[endpos:]
else:
break
reddit_url = 'http://www.reddit.com/r/worldnews'
print_all_links(get_page(reddit_url))