1

I'd like to find all anchors from a page that are NOT nested in an italics tag. This is what I have, and it works, but the links are not processed in the correct order (as per the page source).

// Parse the fetched markup; the leading @ silences the warnings loadHTML emits.
@$dom->loadHTML($this->html);
$xpath = new DOMXpath($dom);
// Selects every href attribute of an <a> whose VALUE differs from the href of
// every <a> found inside an <i>. The comparison is by attribute value, not by
// node position, so a URL that occurs both inside and outside italics is
// dropped everywhere it appears.
$anchorlinks = $xpath->query('//a/@href[not(. = //i//a/@href)]');

Any advice as to how I should proceed to diff the two sets in the xpath query would be greatly appreciated.

Thanks.

// Crawl first-links starting from the Yarn article.
// `new` is required: without it PHP looks for a function named gettingToPhilosophy.
// Article paths in the page's own links live under /wiki/, so the start URL uses it too.
$phil = new gettingToPhilosophy("http://en.wikipedia.org/wiki/Yarn");

// hop() fetches the current target and ends the script with die() on a
// revisited or unreachable page; processHTML() picks the next target.
// The class initialises hoplimit to 30, so the bound is unchanged.
for ($i = 0; $i < $phil->hoplimit; $i++)
{
  $phil->hop();
  $phil->processHTML();
}

<?php
/**
 * Follows the first usable, non-italic link of a Wikipedia article, page after
 * page, recording every visited path so that a loop can be detected.
 *
 * Typical use: construct with a start URL, then alternate hop() (fetch the
 * current target) and processHTML() (choose the next target). Fatal conditions
 * are reported by terminating the script with die().
 */
class gettingToPhilosophy
{
  // Scheme and host prepended to the site-relative paths found in hrefs.
  const WIKI_BASE = 'http://en.wikipedia.org';

  public $base_url; //base_url to start with
  public $target_url; //url to hop to: a site-relative path before hop(), absolute after it
  public $previous_link; //site-relative path of the page visited before the next target
  public $lookup; //cached array of visited links (path => 1)
  public $curl; //curl object to execute
  public $html; //html retrieved from curl request
  public $conn; //database connection resource (never opened in this class; see constructor)
  public $hoplimit; //maximum number of hops (23 was the median as per the wikipedia article)
  public $hop_num; //the number of hops taken to reach the philosophy page
  public $id; //id of current link (Primary Key)
  public $child_id; //id of next link

  /**
   * Validates the start URL, extracts its path as the first target and
   * prepares the cURL handle.
   *
   * @param string $base_url Absolute URL of the article to start from.
   */
  function __construct($base_url)
  {
    $this->base_url = filter_var($base_url, FILTER_VALIDATE_URL);

    //determine if url is valid
    if (!($this->base_url))
    {
      die("<font color='red'>Invalid URL</font>");
    }

    $this->target_url = parse_url($base_url, PHP_URL_PATH);
    $this->previous_link = '';
    $this->lookup = array();
    $this->curl = curl_init();

    // NOTE(review): this user agent impersonates Googlebot; a descriptive,
    // honest agent string is probably preferable - confirm before deploying.
    $userAgent = 'Googlebot/2.1 (http://www.google.bot.com/bot.html)';

    // These options never change between requests, so they are set once here
    // instead of on every hop; only CURLOPT_URL is set per request.
    curl_setopt($this->curl, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($this->curl, CURLOPT_FAILONERROR, true);
    curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($this->curl, CURLOPT_TIMEOUT, 10);

    // The database connection is intentionally left unset; printLinks()
    // is a no-op until one is assigned to $this->conn.
    //$this->conn = pg_connect("dbname=Wesley user=Wesley host=localhost") or die("Can't connect to database".pg_last_error());
    $this->hoplimit = 30;
    $this->hop_num = 0;
    $this->id = 1;
    $this->child_id = 0;
  }

  /**
   * Drops the references held by the instance.
   */
  function __destruct()
  {
    $this->base_url = null;
    $this->target_url = null;
    $this->previous_link = null;
    $this->curl = null;
    $this->lookup = null;
    $this->conn = null;
    $this->id = null;
    $this->child_id = null;
  }

  /**
   * Fetches the page at the current target path into $this->html.
   *
   * Terminates the script when the path was already visited or when the
   * request fails. On success target_url holds the absolute URL that was
   * fetched and hop_num is incremented.
   */
  function hop()
  {
    //Error handling for cached results of links
    if (isset($this->lookup[$this->target_url]))
    {
      die("<font color='red'>Never ending loop: $this->target_url has already been seen</font>");
    }

    $this->lookup[$this->target_url] = 1; //cache the link

    $this->child_id++;
    // The statement is only echoed, never executed. If it is ever sent to the
    // database, bind the values with pg_query_params() instead of interpolating.
    $sql = "insert into Philosophy (base_url, childid, link) values('$this->base_url', $this->child_id, '$this->target_url')";
    echo "$sql <br/>";

    //append the path to the wikipedia url scheme
    $this->target_url = self::WIKI_BASE.$this->target_url;
    curl_setopt($this->curl, CURLOPT_URL, $this->target_url);

    // Get html from the page
    $this->html = curl_exec($this->curl);

    // Error handling for invalid link
    if (!$this->html)
    {
      //target_url was invalid or not reachable
      die("<font color='red'>$this->target_url is invalid or unreachable - Hopped $this->hop_num times</font>");
    }
    $this->hop_num++;
  }

  /**
   * Chooses the next target from the fetched page: the first href, in the
   * order the XPath engine returns them, of an <a> that has no <i> ancestor
   * and that passes isValid().
   *
   * On success previous_link becomes the path of the page just processed and
   * target_url becomes the chosen href. Terminates the script when the page
   * offers no usable link, because the following hop() would otherwise
   * prepend the host to an already absolute URL.
   */
  function processHTML()
  {
    $dom = new DOMDocument();

    // Collect markup warnings internally rather than silencing them with @,
    // then restore whatever setting the caller had.
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($this->html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($dom);
    $anchorlinks = $xpath->query('//a[not(ancestor::i)]/@href');

    // hrefs on the page are site-relative, so the current page must be
    // remembered in the same form for isValid() to recognise it later.
    $currentPath = $this->relativePath($this->target_url);

    foreach ($anchorlinks as $anchorlink)
    {
      $href = $anchorlink->nodeValue;
      if (!$this->isValid($href)) { continue; }

      $this->previous_link = $currentPath;
      $this->target_url = "$href";
      return;
    }

    die("<font color='red'>No valid link found on $this->target_url - Hopped $this->hop_num times</font>");
  }

  /**
   * Decides whether a href is worth following.
   *
   * @param string $link href value taken from the page.
   * @return bool false for the previously visited page and for any href
   *              containing one of the ignored fragments, true otherwise.
   */
  function isValid($link)
  {
    if ($link === $this->previous_link) { return false; }

    //links to ignore, matched case-sensitively
    $exactFragments = array('#', '(', '.jpg', '?', '//');
    foreach ($exactFragments as $fragment)
    {
      if (strpos($link, $fragment) !== false) { return false; }
    }

    //links to ignore, matched case-insensitively
    $looseFragments = array('Help:', 'navigation', '[note', 'File:', 'http', 'Portal:',
      'Special:', 'Wikipedia:', 'Talk:', 'Category:', 'Main_Page');
    foreach ($looseFragments as $fragment)
    {
      if (stripos($link, $fragment) !== false) { return false; }
    }

    return true;
  }

  /**
   * Prints the stored hops for this start URL, one "childid) link" per line.
   * Does nothing while no database connection has been assigned.
   */
  function printLinks()
  {
    if (!$this->conn) { return; }

    // pg_execute() runs a previously prepared statement; an ad-hoc query with
    // a bound value needs pg_query_params(). Single quotes keep $1 literal.
    $sql = 'select childid, link from philosophy where base_url = $1';
    if ($result = pg_query_params($this->conn, $sql, array($this->base_url)))
    {
      while ($row = pg_fetch_assoc($result))
      {
        // Stored links originate from scraped pages, so escape them for HTML.
        $childId = htmlspecialchars($row['childid'], ENT_QUOTES, 'UTF-8');
        $link = htmlspecialchars($row['link'], ENT_QUOTES, 'UTF-8');
        echo "{$childId}) {$link} <br/>";
      }
    }
  }

  /**
   * Strips the wikipedia scheme and host from a URL when present.
   *
   * @param string $url Absolute or site-relative URL.
   * @return string The URL without the WIKI_BASE prefix.
   */
  private function relativePath($url)
  {
    $prefixLength = strlen(self::WIKI_BASE);
    if (strncmp($url, self::WIKI_BASE, $prefixLength) === 0)
    {
      return substr($url, $prefixLength);
    }

    return $url;
  }
}
?>
0

1 Answer 1

2

I'd like to find all anchors from a page that are NOT nested in an italics tag

Then you should rather use

//a[not(ancestor::i)]/@href

which does exactly that: it finds the href attributes of all "a" elements that are not descendants of an "i" element.


The order of nodes in the result set may vary between implementations of XPath 1.0. Using a compliant XPath processor to apply the above XPath expression to http://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy results in (individual results separated by -----):

 href="#mw-head"
-----------------------
href="#p-search"
-----------------------
href="/wiki/File:Essay.svg"
-----------------------
href="/wiki/Wikipedia:Wikipedia_essays"
-----------------------
href="/wiki/Wikipedia:Policies_and_guidelines"
-----------------------
href="/wiki/Hyperlink"
-----------------------
href="/wiki/Wikipedia"
-----------------------
href="/wiki/Philosophy"
-----------------------
href="/wiki/Philosophy"
-----------------------
href="#cite_note-1"
-----------------------
href="/wiki/File:Crawl_on_Wikipedia_from_random_article_to_Philosophy..gif"
-----------------------
href="/wiki/File:Crawl_on_Wikipedia_from_random_article_to_Philosophy..gif"
-----------------------
href="/wiki/Document_classification"
-----------------------
href="/wiki/Wikipedia:MOSBEGIN"
-----------------------
href="/wiki/Mathematics"
-----------------------
href="/wiki/Science"
-----------------------
href="/wiki/Language"
-----------------------
href="/wiki/Philosophy"
-----------------------
href="#Method_summarized"
-----------------------
href="#Origins"
-----------------------
href="#Examples_of_exceptions_to_the_Getting_to_Philosophy_rule"
-----------------------
href="#See_also"
-----------------------
href="#References"
-----------------------
href="#External_links"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=1"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=2"
-----------------------
href="/wiki/Phenomenon"
-----------------------
href="/wiki/User:Mark_J"
-----------------------
href="#cite_note-2"
-----------------------
href="/wiki/Wikipedia:WikipediaWeekly/Episode50"
-----------------------
href="/wiki/Podcast"
-----------------------
href="#cite_note-3"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=3"
-----------------------
href="/wiki/Yarn"
-----------------------
href="/wiki/Fibres"
-----------------------
href="/wiki/Rope"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=4"
-----------------------
href="/wiki/Small-world_network"
-----------------------
href="/wiki/Attractor"
-----------------------
href="/wiki/Wikipedia:Wiki_Game"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=5"
-----------------------
href="#cite_ref-1"
-----------------------
href="/wiki/User:Ilmari_Karonen/First_link"
-----------------------
href="/wiki/Help:CS1_errors#cite_web_url"
-----------------------
href="#cite_ref-2"
-----------------------
href="http://en.wikipedia.org/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;oldid=215744293"
-----------------------
href="#cite_ref-3"
-----------------------
href="http://huffduffer.com/psd/42471"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=6"
-----------------------
href="http://www.xefer.com/wikipedia"
-----------------------
href="http://www.youtube.com/watch?v=vehDe2lSptU"
-----------------------
href="/wiki/Philosophy"
-----------------------
href="http://matpalm.com/blog/2011/08/13/wikipedia-philosophy/"
-----------------------
href="http://xkcd.com/903/"
-----------------------
href="/wiki/Xkcd"
-----------------------
href="/wiki/Tooltip"
-----------------------
href="http://wikiloopr.com/"
-----------------------
href="http://www.guardian.co.uk/technology/2011/jul/10/only-way-essex-wikipedia-philosophy"
-----------------------
href="/wiki/The_Guardian"
-----------------------
href="http://www.huffingtonpost.com/2011/11/14/wikipedia-philosophy_n_1093460.html"
-----------------------
href="http://en.wikipedia.org/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;oldid=645649870"
-----------------------
href="/wiki/Help:Category"
-----------------------
href="/wiki/Category:Wikipedia_essays"
-----------------------
href="/wiki/Category:Pages_using_web_citations_with_no_URL"
-----------------------
href="/w/index.php?title=Special:UserLogin&amp;returnto=Wikipedia:Getting+to+Philosophy&amp;type=signup"
-----------------------
href="/w/index.php?title=Special:UserLogin&amp;returnto=Wikipedia:Getting+to+Philosophy"
-----------------------
href="/wiki/Wikipedia:Getting_to_Philosophy"
-----------------------
href="/wiki/Wikipedia_talk:Getting_to_Philosophy"
-----------------------
href="#"
-----------------------
href="/wiki/Wikipedia:Getting_to_Philosophy"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=history"
-----------------------
href="#"
-----------------------
href="/wiki/Main_Page"
-----------------------
href="/wiki/Main_Page"
-----------------------
href="/wiki/Portal:Contents"
-----------------------
href="/wiki/Portal:Featured_content"
-----------------------
href="/wiki/Portal:Current_events"
-----------------------
href="/wiki/Special:Random"
-----------------------
href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en"
-----------------------
href="//shop.wikimedia.org"
-----------------------
href="/wiki/Help:Contents"
-----------------------
href="/wiki/Wikipedia:About"
-----------------------
href="/wiki/Wikipedia:Community_portal"
-----------------------
href="/wiki/Special:RecentChanges"
-----------------------
href="//en.wikipedia.org/wiki/Wikipedia:Contact_us"
-----------------------
href="/wiki/Special:WhatLinksHere/Wikipedia:Getting_to_Philosophy"
-----------------------
href="/wiki/Special:RecentChangesLinked/Wikipedia:Getting_to_Philosophy"
-----------------------
href="/wiki/Wikipedia:File_Upload_Wizard"
-----------------------
href="/wiki/Special:SpecialPages"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;oldid=645649870"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=info"
-----------------------
href="//www.wikidata.org/wiki/Q14605740"
-----------------------
href="/w/index.php?title=Special:Book&amp;bookcmd=book_creator&amp;referer=Wikipedia:Getting+to+Philosophy"
-----------------------
href="/w/index.php?title=Special:Book&amp;bookcmd=render_article&amp;arttitle=Wikipedia:Getting+to+Philosophy&amp;oldid=645649870&amp;writer=rdf2latex"
-----------------------
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;printable=yes"
-----------------------
href="//fr.wikipedia.org/wiki/Wikip&#xE9;dia:Se_rendre_&#xE0;_l'article_philosophie"
-----------------------
href="//uk.wikipedia.org/wiki/&#x412;&#x456;&#x43A;&#x456;&#x43F;&#x435;&#x434;&#x456;&#x44F;:&#x412;&#x441;&#x456;_&#x43F;&#x43E;&#x441;&#x438;&#x43B;&#x430;&#x43D;&#x43D;&#x44F;_&#x432;&#x435;&#x434;&#x443;&#x442;&#x44C;_&#x434;&#x43E;_&#x444;&#x456;&#x43B;&#x43E;&#x441;&#x43E;&#x444;&#x456;&#x457;"
-----------------------
href="#"
-----------------------
href="//www.wikidata.org/wiki/Q14605740#sitelinks-wikipedia"
-----------------------
href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License"
-----------------------
href="//creativecommons.org/licenses/by-sa/3.0/"
-----------------------
href="//wikimediafoundation.org/wiki/Terms_of_Use"
-----------------------
href="//wikimediafoundation.org/wiki/Privacy_policy"
-----------------------
href="//www.wikimediafoundation.org/"
-----------------------
href="//wikimediafoundation.org/wiki/Privacy_policy"
-----------------------
href="/wiki/Wikipedia:About"
-----------------------
href="/wiki/Wikipedia:General_disclaimer"
-----------------------
href="//en.wikipedia.org/wiki/Wikipedia:Contact_us"
-----------------------
href="https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute"
-----------------------
href="//en.m.wikipedia.org/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;mobileaction=toggle_view_mobile"
-----------------------
href="//wikimediafoundation.org/"
-----------------------
href="//www.mediawiki.org/"
Sign up to request clarification or add additional context in comments.

6 Comments

Thanks!! en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy I'm trying to solve this problem but, unfortunately, your xpath query does not process the links in the correct order either...any additional advice?
Does xpath process nodes in a different order?
@Wes You are welcome. Most likely, your XPath engine only supports XPath 1.0. In version 1.0, nodes are defined as sets (let me also point you to a Wiki page), which means there is no particular order to them. But still, engines often return results in document order. You'd have to show all of your PHP code and also post the result you currently obtain, otherwise all bets are off.
$this->html = curl_exec($this->curl); //assume I set the curl up correctly $dom = new DOMDocument(); @$dom->loadHTML($this->html); $xpath = new DOMXpath($dom); $anchorlinks = $xpath->query('//a[not(ancestor::i)]/@href'); //echo "<pre>"; print_r(iterator_to_array($anchorlinks)); echo "</pre>"; if the target_url is en.wikipedia.org/wiki/Yarn the results I get are '/wiki/Fibre','/wiki/Natural_fiber', '/wiki/Fiber', '/wiki/Natural_fiber'..they should be '/wiki/Fibres', '/wiki/Rope'
@Wes I have added my result list to my answer. How does your result compare to this? Please never add code or other essential information in a comment - always edit your question.
|

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.