Calcetines Extreme

Calcetines Extreme
Take care of yourself by using the best socks

Wednesday, May 13, 2015

Web Scraper

// Page whose outgoing links will be listed.
$request_url = 'https://www.google.es/search?q=Barcelona';

 // Pattern for absolute http(s)/ftp(s) URLs: scheme, host ending in a
 // 2-3 letter TLD, then an optional path.
 $reg_exUrl = '/(http|https|ftp|ftps)\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(\/\S*)?/';

 function get_domain($url)
 {
  $pieces = parse_url($url);
  $domain = isset($pieces['host']) ? $pieces['host'] : '';
  if (preg_match('/(?P<domain>[a-z0-9][a-z0-9\-]{1,63}\.[a-z\.]{2,6})$/i', $domain, $regs)) {
   return $regs['domain'];
  }
  return false;
 }

 $ch = curl_init();
 curl_setopt($ch, CURLOPT_URL, $request_url); // The url to get links from

 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // We want to get the respone

 $result = curl_exec($ch);

 $regex='|<a.*?href="(.*?)"|';
 preg_match_all($regex,$result,$parts);
 $title = preg_match('/title="(.+)">/', $html, $match);
 $links=$parts[1];
 asort($links);

 foreach($links as $link){
  $pos = strpos($link, '://');
  $exclude = strpos($link, 'google');  //remove google own results

  if ($pos!=0 && $exclude==0){
    $posini = strpos($link, 'http');
    $link = substr($link, $posini);
    echo "<a href='".$link."'>".get_domain($link)."</a> -> ".$link."<br>";
  }
 }

 curl_close($ch);


Sample:

Sources: