grypto just another tech blog

26Jun/092

Create a PHP web crawler or scraper in 5 minutes using php-CURL, and php-DOM

This post is a response to the many articles floating around the internet titled, more or less, "Create a PHP web crawler or scraper in 5 minutes." Here we use PHP DOM and some slightly more involved methods to create an "infinitely extendable web crawler in under 5 minutes." In practice, the time depends on how fast you can read and copy the code.

The Framework

We start with a blank DOM_Crawler class:

/**
 * Starting point for the crawler: an empty class that the following
 * sections fill in with properties and methods.
 */
class DOM_Crawler
{
 
}

Next, we'll create the methods to fetch, prepare, load into DOM, and collect links.

/**
 * Crawler skeleton: properties plus empty method stubs for fetching,
 * preparing, loading into the DOM, and collecting links. The method
 * bodies are intentionally empty at this stage.
 */
class DOM_Crawler
{
	/** Markup of the page being crawled; starts out as an empty string. */
	protected $markup = '';
 
	/** DOM representation of the page; null until it has been initialised. */
	protected $dom = null;
 
	/** Base URL of the crawled page; null until it has been computed. */
	protected $base_url = null;
 
	/**
	 * Entry point: receives the URL of the page to crawl.
	 */
	public function __construct ($url)
	{
 
	}
 
	/**
	 * Stub for fetching the markup found at $url.
	 */
	protected function _load_markup ($url)
	{
 
	}
 
	/**
	 * Stub for preparing fetched markup before it is loaded into the DOM.
	 * $encod optionally names the encoding of $content (defaults to '').
	 */
	protected function _prepare_markup ($content, $encod='')
	{
 
	}
 
	/**
	 * Stub for loading the prepared markup into the DOM.
	 */
	protected function _init_dom ()
	{
 
	}
 
	/**
	 * Public accessor stub; $type selects which kind of data to collect.
	 */
	public function get ($type)
	{
 
	}
 
	/**
	 * Stub for collecting the links of the page.
	 */
	protected function _get_links ()
	{
 
	}
 
	/**
	 * Stub for working out the base URL that belongs to $url.
	 */
	protected function _base_url ($url)
	{
 
	}
}

Fetching Content

As in the other examples, the constructor accepts a URL and passes it on to the method "_load_markup()." php-CURL is used here instead of file_get_contents() because it offers control over timeouts and other request options and does not depend on the allow_url_fopen setting. "_prepare_markup()" is then used to fix encoding issues before the markup is loaded into php-DOM; the technique is explained in the comments of PHP: DOMDocument::loadHTML - Manual. "_init_dom()" initializes php-DOM.

	/**
	 * Fetch the page at $url and get it ready for querying.
	 *
	 * The steps run in a fixed order: derive the base URL, download the
	 * markup, prepare the markup, then load the result into the DOM.
	 *
	 * @param string $url URL of the page to crawl.
	 */
	public function __construct ($url)
	{
		$this->base_url = $this->_base_url($url);
		// Raw download first; the next line replaces it with the prepared version.
		$this->markup = $this->_load_markup($url);
		$this->markup = $this->_prepare_markup($this->markup);
		$this->_init_dom();
	}
 
	/**
	 * Download the response body for a URL using cURL.
	 *
	 * @param string $url Address to request.
	 * @return string|false Response body, or false when curl_exec() fails.
	 */
	protected function _load_markup ($url)
	{
		// Seconds allowed for establishing the connection.
		$connect_timeout = 10;

		$handle = curl_init();
		curl_setopt_array($handle, array(
			CURLOPT_URL            => $url,
			CURLOPT_RETURNTRANSFER => 1,
			CURLOPT_CONNECTTIMEOUT => $connect_timeout,
		));
		$body = curl_exec($handle);
		curl_close($handle);

		return $body;
	}
 
	/**
	 * Make fetched markup safe to hand to DOMDocument::loadHTML().
	 *
	 * A Content-Type <meta> tag announcing the encoding is injected right
	 * after the opening <head> tag (when one is found), and every non-ASCII
	 * character is rewritten as a numeric HTML entity so the parser cannot
	 * misinterpret it.
	 *
	 * @param string|false $content Raw markup to prepare.
	 * @param string       $encod   Encoding of $content; detected when empty.
	 * @return string|null Prepared markup, or null when $content is empty.
	 */
	protected function _prepare_markup ($content, $encod='')
	{
		mb_detect_order("ASCII,UTF-8,ISO-8859-1,windows-1252,iso-8859-15");
		if (empty($content))
		{
			// Nothing was downloaded; keep the original "no value" result.
			return null;
		}
		if (empty($encod))
		{
			$encod = mb_detect_encoding($content);
		}
		$headpos = mb_strpos($content,'<head>');
		if ($headpos === false)
		{
			$headpos = mb_strpos($content,'<HEAD>');
		}
		if ($headpos !== false)
		{
			// Step over the 6 characters of "<head>" so the tag lands inside it.
			$headpos += 6;
			$content = mb_substr($content, 0, $headpos)
				. '<meta http-equiv="Content-Type" content="text/html; charset=' . $encod . '">'
				. mb_substr($content, $headpos);
		}
		// mb_convert_encoding(..., 'HTML-ENTITIES', ...) is deprecated as of
		// PHP 8.2. The map below turns every code point from U+0080 upwards
		// into a numeric entity, which the HTML parser reads the same way.
		$convmap = array(0x80, 0x10FFFF, 0, 0x1FFFFF);
		return mb_encode_numericentity($content, $convmap, $encod);
	}
 
	/**
	 * Build the DOM tree from the prepared markup.
	 *
	 * Real-world pages are rarely valid HTML, so libxml parse errors are
	 * collected internally and discarded instead of being raised as PHP
	 * warnings. When there is no markup (for example because the download
	 * failed) the DOM stays an empty document, because
	 * DOMDocument::loadHTML() rejects an empty string.
	 */
	protected function _init_dom ()
	{
		$this->dom = new DomDocument;
		if (!empty($this->markup))
		{
			// Remember the caller's libxml setting so it can be restored afterwards.
			$previous = libxml_use_internal_errors(true);
			$this->dom->loadHTML($this->markup);
			libxml_clear_errors();
			libxml_use_internal_errors($previous);
		}
		$this->dom->normalizeDocument();
	}

Crawling the Content for Data

From the other examples: "Our get() method will accept a $type string which essentially will simply be used to invoke another method actually doing the processing. As you can see below we construct the method name as a string, then make sure it is available so now developers can utilize this simply by invoking $crawl->get('images');."

The "_get_links()" method starts by fetching all tags named 'a', then uses DOMElement::getAttribute to read the actual URL from each "href" attribute. If a link starts with "/" and therefore lacks the base ("scheme, user, password, host, port"), we fix it by prepending $this->base_url. Links starting with '#' are simply not included in the array.

	/**
	 * Collect one kind of data from the loaded page.
	 *
	 * The request is dispatched to the method named "_get_{$type}", so
	 * get('links') runs _get_links().
	 *
	 * @param string $type Collector name, e.g. 'links'.
	 * @return mixed The collector's result, or null when no such collector exists.
	 */
	public function get ($type)
	{
		$method = "_get_{$type}";
		if (!method_exists($this, $method))
		{
			return null;
		}
		// call_user_method() was removed in PHP 7.0; a variable method call
		// performs the same dispatch on every PHP version.
		return $this->$method();
	}
 
	/**
	 * Collect the target of every <a> element on the page.
	 *
	 * Targets starting with "/" are prefixed with the base URL, targets
	 * starting with "#" are dropped, and anchors without an href are ignored.
	 * Every other target is returned unchanged.
	 *
	 * @return array|false List of link targets (possibly empty), or false
	 *                     when no markup was loaded.
	 */
	protected function _get_links ()
	{
		if (empty($this->markup))
		{
			return false;
		}

		// Start from an empty list so a page without usable links yields
		// array() instead of an undefined variable.
		$links = array();
		foreach ($this->dom->getElementsByTagName('a') as $anchor)
		{
			$href = $anchor->getAttribute('href');
			if ($href === '')
			{
				// An <a> without href (e.g. a named anchor) is not a link.
				continue;
			}

			$first_char = mb_substr($href, 0, 1);
			if ($first_char === '/')
			{
				$links[] = $this->base_url . $href;
			}
			else if ($first_char !== '#')
			{
				$links[] = $href;
			}
		}
		return $links;
	}
 
	/**
	 * Reduce a URL to its base: scheme, optional credentials, host and
	 * optional port, without any path, query string or fragment.
	 *
	 * @param string $url URL to reduce.
	 * @return string Base in the form "scheme://[user[:pass]@]host[:port]".
	 */
	protected function _base_url ($url)
	{
		$parts = parse_url($url);

		// Credentials appear only when a user name is present.
		$credentials = '';
		if (!empty($parts['user']))
		{
			$credentials = $parts['user'];
			if (!empty($parts['pass']))
			{
				$credentials .= ':' . $parts['pass'];
			}
			$credentials .= '@';
		}

		$port = !empty($parts['port']) ? ':' . $parts['port'] : '';

		return $parts['scheme'] . '://' . $credentials . $parts['host'] . $port;
	}

Final Web Crawler Code

/**
 * Minimal page crawler built on the cURL and DOM extensions.
 *
 * Constructing an instance downloads one page and loads it into a DOM
 * tree; get() then collects data from it (currently only 'links').
 */
class DOM_Crawler {

	/** Prepared markup of the page; empty when nothing could be downloaded. */
	protected $markup = '';

	/** DOM tree built from $markup. */
	protected $dom = null;

	/** "scheme://[user[:pass]@]host[:port]" of the crawled URL. */
	protected $base_url = null;

	/**
	 * Fetch the page at $url and get it ready for querying.
	 *
	 * @param string $url URL of the page to crawl.
	 */
	public function __construct ($url)
	{
		$this->base_url = $this->_base_url($url);
		$this->markup = $this->_load_markup($url);
		$this->markup = $this->_prepare_markup($this->markup);
		$this->_init_dom();
	}

	/**
	 * Download the response body for a URL using cURL.
	 *
	 * @param string $url Address to request.
	 * @return string|false Response body, or false when curl_exec() fails.
	 */
	protected function _load_markup ($url)
	{
		// Seconds allowed for establishing the connection.
		$connect_timeout = 10;

		$handle = curl_init();
		curl_setopt_array($handle, array(
			CURLOPT_URL            => $url,
			CURLOPT_RETURNTRANSFER => 1,
			CURLOPT_CONNECTTIMEOUT => $connect_timeout,
		));
		$body = curl_exec($handle);
		curl_close($handle);

		return $body;
	}

	/**
	 * Make fetched markup safe to hand to DOMDocument::loadHTML().
	 *
	 * A Content-Type <meta> tag announcing the encoding is injected right
	 * after the opening <head> tag (when one is found), and every non-ASCII
	 * character is rewritten as a numeric HTML entity so the parser cannot
	 * misinterpret it.
	 *
	 * @param string|false $content Raw markup to prepare.
	 * @param string       $encod   Encoding of $content; detected when empty.
	 * @return string|null Prepared markup, or null when $content is empty.
	 */
	protected function _prepare_markup ($content, $encod='')
	{
		mb_detect_order("ASCII,UTF-8,ISO-8859-1,windows-1252,iso-8859-15");
		if (empty($content))
		{
			// Nothing was downloaded; keep the original "no value" result.
			return null;
		}
		if (empty($encod))
		{
			$encod = mb_detect_encoding($content);
		}
		$headpos = mb_strpos($content,'<head>');
		if ($headpos === false)
		{
			$headpos = mb_strpos($content,'<HEAD>');
		}
		if ($headpos !== false)
		{
			// Step over the 6 characters of "<head>" so the tag lands inside it.
			$headpos += 6;
			$content = mb_substr($content, 0, $headpos)
				. '<meta http-equiv="Content-Type" content="text/html; charset=' . $encod . '">'
				. mb_substr($content, $headpos);
		}
		// mb_convert_encoding(..., 'HTML-ENTITIES', ...) is deprecated as of
		// PHP 8.2. The map below turns every code point from U+0080 upwards
		// into a numeric entity, which the HTML parser reads the same way.
		$convmap = array(0x80, 0x10FFFF, 0, 0x1FFFFF);
		return mb_encode_numericentity($content, $convmap, $encod);
	}

	/**
	 * Build the DOM tree from the prepared markup.
	 *
	 * libxml parse errors caused by invalid real-world HTML are collected
	 * internally and discarded instead of being raised as PHP warnings.
	 * When there is no markup the DOM stays an empty document, because
	 * DOMDocument::loadHTML() rejects an empty string.
	 */
	protected function _init_dom ()
	{
		$this->dom = new DomDocument;
		if (!empty($this->markup))
		{
			// Remember the caller's libxml setting so it can be restored afterwards.
			$previous = libxml_use_internal_errors(true);
			$this->dom->loadHTML($this->markup);
			libxml_clear_errors();
			libxml_use_internal_errors($previous);
		}
		$this->dom->normalizeDocument();
	}

	/**
	 * Collect one kind of data from the loaded page.
	 *
	 * The request is dispatched to the method named "_get_{$type}", so
	 * get('links') runs _get_links().
	 *
	 * @param string $type Collector name, e.g. 'links'.
	 * @return mixed The collector's result, or null when no such collector exists.
	 */
	public function get ($type)
	{
		$method = "_get_{$type}";
		if (!method_exists($this, $method))
		{
			return null;
		}
		// call_user_method() was removed in PHP 7.0; a variable method call
		// performs the same dispatch on every PHP version.
		return $this->$method();
	}

	/**
	 * Collect the target of every <a> element on the page.
	 *
	 * Targets starting with "/" are prefixed with the base URL, targets
	 * starting with "#" are dropped, and anchors without an href are ignored.
	 * Every other target is returned unchanged.
	 *
	 * @return array|false List of link targets (possibly empty), or false
	 *                     when no markup was loaded.
	 */
	protected function _get_links ()
	{
		if (empty($this->markup))
		{
			return false;
		}

		// Start from an empty list so a page without usable links yields
		// array() instead of an undefined variable.
		$links = array();
		foreach ($this->dom->getElementsByTagName('a') as $anchor)
		{
			$href = $anchor->getAttribute('href');
			if ($href === '')
			{
				// An <a> without href (e.g. a named anchor) is not a link.
				continue;
			}

			$first_char = mb_substr($href, 0, 1);
			if ($first_char === '/')
			{
				$links[] = $this->base_url . $href;
			}
			else if ($first_char !== '#')
			{
				$links[] = $href;
			}
		}
		return $links;
	}

	/**
	 * Reduce a URL to its base: scheme, optional credentials, host and
	 * optional port, without any path, query string or fragment.
	 *
	 * @param string $url URL to reduce.
	 * @return string Base in the form "scheme://[user[:pass]@]host[:port]".
	 */
	protected function _base_url ($url)
	{
		$parts = parse_url($url);

		// Credentials appear only when a user name is present.
		$credentials = '';
		if (!empty($parts['user']))
		{
			$credentials = $parts['user'];
			if (!empty($parts['pass']))
			{
				$credentials .= ':' . $parts['pass'];
			}
			$credentials .= '@';
		}

		$port = !empty($parts['port']) ? ':' . $parts['port'] : '';

		return $parts['scheme'] . '://' . $credentials . $parts['host'] . $port;
	}
}

Usage

// Crawl outwards from the seed URL, visiting collected links in order,
// until at least 100 unique links are known or there is nothing left to visit.
$links = array('http://www.reddit.com/');
for ($i = 0; $i < count($links) && count($links) < 100; $i++)
{
	$crawler = new DOM_Crawler($links[$i]);
	$found = $crawler->get('links');
	// get('links') returns false when the page could not be loaded.
	if (is_array($found))
	{
		$links = _merge_unique($links, $found);
	}
}
print_r($links);
 
/**
 * Append to $arr every value of $arr2 that it does not already contain.
 *
 * Values are compared strictly, so numeric-looking strings such as '1000'
 * and '1e3' count as different. The keys of $arr2 are ignored.
 *
 * @param array $arr  List to extend.
 * @param mixed $arr2 Values to add; anything that is not an array is ignored.
 * @return array $arr followed by the new values, in their original order.
 */
function _merge_unique ($arr, $arr2)
{
	if (!is_array($arr2))
	{
		// e.g. false from a crawler that could not load its page
		return $arr;
	}
	foreach ($arr2 as $value)
	{
		if (!in_array($value, $arr, true))
		{
			$arr[] = $value;
		}
	}
	return $arr;
}

Returns:

Array
(
    [0] => http://www.reddit.com/
    [1] => http://www.reddit.com/r/funny/
    [2] => http://www.reddit.com/r/gaming/
    [3] => http://www.reddit.com/r/pics/
    [4] => http://www.reddit.com/r/politics/
    [5] => http://www.reddit.com/r/programming/
    [6] => http://www.reddit.com/r/reddit.com/
    [7] => http://www.reddit.com/r/science/
    [8] => http://www.reddit.com/r/technology/
    [9] => http://www.reddit.com/r/worldnews/
    [10] => http://www.reddit.com/r/WTF/
    .....
    [201] => http://www.reddit.com/help/faq
    [202] => http://www.reddit.com/help/reddiquette
    [203] => http://www.reddit.com/bookmarklets/
    [204] => http://www.reddit.com/buttons/
    [205] => http://www.reddit.com/code/
    [206] => http://www.reddit.com/socialite/
    [207] => http://www.reddit.com/widget/
    [208] => http://www.reddit.com/iphone/
    [209] => http://www.reddit.com/blog/
    [210] => http://justin.tv/reddit
    [211] => http://www.reddit.com/ad_inq/
    [212] => http://www.reddit.tv
    [213] => http://www.redditall.com
    [214] => http://www.baconbuzz.com
    [215] => http://reddit.destructoid.com
    [216] => http://www.thecutelist.com
    [217] => http://reddit.independent.co.uk
    [218] => http://www.redditgadgetguide.com
    [219] => http://www.weheartgossip.com
    [220] => http://www.idealistnews.com
    [221] => http://www.wired.com
    [222] => http://www.arstechnica.com
    [223] => http://www.style.com
    [224] => http://www.epicurious.com
    [225] => http://www.concierge.com
    [226] => http://reddit.com/help/useragreement
    [227] => http://reddit.com/help/privacypolicy
    [228] => http://www.reddit.com/feedback
)
Comments (2) Trackbacks (0)
  1. This code works perfect. How do I get the anchor text in addition to the url? I will appreciate some feedback. Thanks again for sharing such a useful piece of code.


Leave a comment


No trackbacks yet.