domain=$domain.'/';
        // NOTE(review): the beginning of this method (and the class header) lies outside
        // the visible chunk. Everything down to "return $links;" is reproduced unchanged;
        // only line breaks were restored so the "//" comments end where they should.

        //extracting links
        $html=$this->get_content($url);
        // NOTE(review): this pattern has an unbalanced ")" and looks truncated (probably by
        // HTML stripping). The code below reads capture groups 2 (URL) and 3 (anchor text),
        // so restore the full pattern from the original file.
        $reg = "#]+).*>([^<>]+)#i";
        // NOTE(review): search and replacement are both empty strings here - presumably
        // truncated as well.
        $html=str_replace('','',$html);
        //getting strings
        // NOTE(review): my_preg_match_all() takes ($start,$end,$string); this call passes
        // two arguments, so the delimiters were presumably lost too.
        $data=$this->my_preg_match_all('',$html);
        //now extracting urls
        $links=array(); //here will be result
        $hrefs=array();
        foreach($data as $k=>$v) {
            if (preg_match_all($reg, $v, $matches)) {
                //print_r($matches);
                $link_url=$matches[2][0];
                $link_anchor=$matches[3][0];
                //echo "\n".$v.'=>';
                $temp_url=$this->create_full_link($link_url,$this->domain,$url);
                if($temp_url!==false) {
                    //extracting title of link
                    $title=$link_anchor;
                    /*////cleaning links # and /
                    if(strpos($temp_url,'#')!==false) $temp_url=substr($temp_url,0,strpos($temp_url,'#'));
                    $temp_url=rtrim($temp_url,'/');
                    */
                    //is this internal or external link?
                    $domain=parse_url($url);
                    if(substr($domain['host'],0,4)=='www.') $domain=substr($domain['host'],4,strlen($domain['host'])-4);
                    else $domain=$domain['host'];
                    $link_domain=parse_url($temp_url);
                    if(substr($link_domain['host'],0,4)=='www.') $link_domain=substr($link_domain['host'],4,strlen($link_domain['host'])-4) ;
                    else $link_domain=$link_domain['host'];
                    if(!in_array($temp_url,$hrefs)) {
                        if($domain==$link_domain) $links['internal'][]=array($temp_url,$title);
                        else $links['external'][]=array($temp_url,$title);
                    }
                    $hrefs[]=$temp_url;
                }
            }
        }
        return $links;
    }

    /**
     * Downloads a page with cURL and flattens it for tag scanning.
     *
     * Redirects are followed and the transfer is given up to 120 seconds. In the
     * returned text every CR and LF is replaced by a space, a space is inserted
     * before each "<" and another after each ">".
     *
     * @param string $url Address to fetch; surrounding whitespace is trimmed.
     * @return string The flattened body, or '' when the handle cannot be created
     *                or the transfer fails.
     */
    function get_content($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt($ch, CURLOPT_URL, trim($url));
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)");
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 120);

        $res = curl_exec($ch);
        curl_close($ch);

        // curl_exec() yields false on failure; make the "empty page" result explicit
        // instead of letting str_replace() coerce false to ''.
        if ($res === false) {
            return '';
        }

        // Pairs are applied in order, exactly like four consecutive str_replace() calls.
        return str_replace(
            array("\r", "\n", "<", ">"),
            array(" ", " ", " <", "> "),
            $res
        );
    }

    /**
     * Turns an href value found on a page into an absolute URL.
     *
     * Resolution rules, checked in this order:
     *  - false                          => false
     *  - "scheme://..."                 => returned unchanged
     *  - "scheme:..." without "//"      => false (mailto:, javascript:, tel: ... cannot
     *                                      be resolved against a page address)
     *  - "//host/path"                  => prefixed with the scheme of $url
     *  - "/path"                        => appended to $domain
     *  - anything else                  => appended to the directory of $url, i.e. its
     *                                      path up to the last "/" (RFC 3986, 5.2.3)
     *
     * The link itself is joined verbatim: "../" segments, query-only and
     * fragment-only references are not normalised.
     *
     * @param string|false $link   Raw href value.
     * @param string       $domain Site root such as "http://example.com/"; a trailing
     *                             slash is optional.
     * @param string       $url    Address of the page the link was found on.
     * @return string|false Absolute URL, or false when the link cannot be resolved.
     */
    function create_full_link($link,$domain,$url)
    {
        if ($link === false) {
            return false;
        }

        // parse_url() may return false and omits keys that are absent, so never index blindly.
        $parts = parse_url($link);
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? $parts['scheme'] : '';
        if ($scheme !== '') {
            if (strncmp($link, $scheme.'://', strlen($scheme) + 3) === 0) {
                return $link;
            }
            // Has a scheme but no authority part: not a fetchable page link.
            return false;
        }

        $base = parse_url($url);
        if (!is_array($base)) {
            $base = array();
        }
        // Normalise so joins below never produce "//" or a missing "/".
        $site = rtrim($domain, '/');

        // Protocol-relative link: inherits only the scheme of the current page.
        // NOTE(review): falls back to "http" when $url carries no scheme - confirm.
        if (strncmp($link, '//', 2) === 0) {
            return (isset($base['scheme']) ? $base['scheme'] : 'http').':'.$link;
        }

        // Root-relative link.
        if (strncmp($link, '/', 1) === 0) {
            return $site.'/'.ltrim($link, '/');
        }

        // Page sits at the site root (or $url is unusable as a base): resolve against $domain.
        $path = isset($base['path']) ? $base['path'] : '';
        if (ltrim($path, '/') === '' || !isset($base['scheme'], $base['host'])) {
            return $site.'/'.$link;
        }

        $root = $base['scheme'].'://'.$base['host'];
        if (isset($base['port'])) {
            $root .= ':'.$base['port'];
        }

        // Base directory = path with its last segment removed.
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '' : rtrim(substr($path, 0, $slash), '/');

        return $root.$dir.'/'.$link;
    }

    /**
     * Collects every substring that starts with $start and ends with the next
     * $end found after it, delimiters included.
     *
     * Scanning resumes right after each match, so matches never overlap. The
     * search stops as soon as no further $start, or no $end after it, is found.
     *
     * @param string $start  Opening delimiter.
     * @param string $end    Closing delimiter.
     * @param string $string Text to scan.
     * @return array List of matched substrings in order of appearance; empty when
     *               nothing matches or either delimiter is an empty string.
     */
    function my_preg_match_all($start,$end,$string)
    {
        $res = array();

        $start = (string) $start;
        $end = (string) $end;
        $string = (string) $string;
        $start_len = strlen($start);
        $end_len = strlen($end);

        // An empty needle matches at offset 0 on PHP 8 and would never advance the scan.
        if ($start_len === 0 || $end_len === 0) {
            return $res;
        }

        $offset = 0;
        while (($first = strpos($string, $start, $offset)) !== false) {
            // Look for the terminator only behind the opening delimiter, so identical
            // or overlapping delimiters cannot match the opener itself.
            $last = strpos($string, $end, $first + $start_len);
            if ($last === false) {
                break;
            }

            $offset = $last + $end_len;
            $res[] = substr($string, $first, $offset - $first);
        }

        return $res;
    }
}
?>