* @copyright Copyright © 2006 Peter Adams
 * @license http://www.gnu.org/copyleft/gpl.html GPL v2.0
 * @category owa
 * @package owa
 * @version $Revision$
 * @since owa 1.0.0
 */
class owa_http {

    /**
     * Configuration
     *
     * @var array
     */
    var $config;

    /**
     * Error handler
     *
     * @var object
     */
    var $e;

    /**
     * The length of text taken from each side of the anchor
     * when building a snippet
     *
     * @var int
     */
    var $snip_len = 100;

    /**
     * The string that is added to the beginning and
     * end of snippet text.
     *
     * @var string
     */
    var $snip_str = '...';

    /**
     * Anchor information for a particular link. Filled in by
     * extract_anchor() with the keys 'anchor_tag' and 'anchor_text'.
     *
     * @var array
     */
    var $anchor_info;

    /**
     * Snoopy instance used by fetch()
     *
     * @var object
     */
    var $crawler;

    var $testcrawler;

    /**
     * http_class instance created lazily by SetupHTTP()
     *
     * @var object
     */
    var $http;

    /**
     * Body of the last document retrieved by getRequest()
     *
     * @var string
     */
    var $response;

    var $response_headers;

    var $response_code;

    var $request_headers;

    /**
     * Last error message produced by the http_class based request methods
     *
     * @var string
     */
    var $error;

    /**
     * Loads the base configuration and error handler and prepares
     * the Snoopy crawler used by fetch().
     */
    function __construct() {

        // Objects are passed by handle, so no reference assignment is needed.
        $c = owa_coreAPI::configSingleton();
        $this->config = $c->fetch('base');
        $this->e = owa_coreAPI::errorSingleton();

        $this->crawler = new Snoopy;
        // let the crawler follow at most five redirects
        $this->crawler->maxredirs = 5;
        $this->crawler->agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
    }

    /**
     * Fetches a URI with the Snoopy crawler.
     *
     * @param string $uri
     * @return mixed whatever the crawler's fetch() returns
     */
    function fetch($uri) {

        return $this->crawler->fetch($uri);
    }

    /**
     * Diagnostic fetch using a throw-away http_class instance with its
     * debug output enabled. Every failure, and the first chunk of the
     * body on success, is written to the debug log. Nothing is returned.
     *
     * @param string $url
     */
    function testFetch($url) {

        $http = new http_class;
        owa_coreAPI::debug('hello owa_http testfetch method');

        /* Connection timeout */
        $http->timeout = 0;
        /* Data transfer timeout */
        $http->data_timeout = 0;
        /* Output debugging information about the progress of the connection */
        $http->debug = 1;
        $http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
        $http->follow_redirect = 1;
        $http->redirection_limit = 5;
        $http->exclude_address = "";
        $http->prefer_curl = 0;

        $arguments = array();
        $error = $http->GetRequestArguments($url, $arguments);
        if ($error != "") {
            owa_coreAPI::debug('owa_http testFetch error: '.$error);
            return;
        }

        $error = $http->Open($arguments);
        if ($error != "") {
            owa_coreAPI::debug('owa_http testFetch error: '.$error);
            return;
        }

        // Same call sequence as OpenRequest(): send, read headers, read body.
        $headers = array();
        $body = '';
        $error = $http->SendRequest($arguments);
        if ($error == "") {
            $error = $http->ReadReplyHeaders($headers);
        }
        if ($error == "") {
            $error = $http->ReadReplyBody($body, 50000);
        }

        // The connection was opened, so it is closed on every path.
        $http->Close();

        if ($error != "") {
            owa_coreAPI::debug('owa_http testFetch error: '.$error);
            return;
        }

        owa_coreAPI::debug(HtmlSpecialChars($body));
    }

    /**
     * Searches a fetched html document for the anchor of a specific url
     *
     * Looks in $this->response for an <a> element whose href equals $link.
     * If nothing matches and $link ends with a slash, the search is repeated
     * without the trailing slash. On success $this->anchor_info holds the
     * matched element ('anchor_tag') and its filtered form ('anchor_text');
     * otherwise it is left as an empty array.
     *
     * @param string $link
     */
    function extract_anchor($link) {

        // Forget the result of any earlier search so it cannot be mistaken
        // for a match of this link.
        $this->anchor_info = array();

        $document = (string) $this->response;
        $matches = array();
        $regex = '/<a[^>]*href=\"%s\"[^>]*>(.*?)<\/a>/i';

        $pattern = trim(sprintf($regex, preg_quote($link, '/')));
        $search = preg_match($pattern, $document, $matches);

        if (empty($matches) && substr($link, -1) === '/') {
            $link = substr($link, 0, -1);
            $pattern = trim(sprintf($regex, preg_quote($link, '/')));
            $search = preg_match($pattern, $document, $matches);
        }

        $this->e->debug('ref search: '.$search);

        if (isset($matches[0])) {
            $this->anchor_info = array(
                'anchor_tag'  => $matches[0],
                'anchor_text' => owa_lib::inputFilter($matches[0])
            );
            $this->e->debug('Anchor info: '.print_r($this->anchor_info, true));
        }
    }

    /**
     * Creates a text snippet of the portion of page where the
     * specific link is found.
     *
     * Takes fully qualified URL for the link to search for. Returns an
     * empty string when the anchor is not found, or when it only occurs
     * inside content that is stripped before the snippet is built.
     *
     * @param string $link
     * @return string
     */
    function extract_anchor_snippet($link) {

        // Search the page for a specific anchor
        $this->extract_anchor($link);

        if (empty($this->anchor_info['anchor_tag'])) {
            return '';
        }

        $anchor_tag = $this->anchor_info['anchor_tag'];

        // drop certain HTML elements together with their content
        $nohtml = $this->strip_selected_tags(
            (string) $this->response,
            array('title', 'head', 'script', 'object', 'style', 'meta', 'link', 'rdf:'),
            true
        );

        // find position within document of the anchor
        $start = strpos($nohtml, $anchor_tag);
        if ($start === false) {
            // the anchor lived inside one of the stripped elements
            return '';
        }

        $replace_items = array("\r\n", "\n\n", "\t", "\r", "\n");

        // Create first segment of snippet: everything before the anchor is
        // cleaned first, then the last snip_len bytes of the clean text are kept.
        $first_part = (string) substr($nohtml, 0, $start);
        $first_part = str_replace($replace_items, '', $first_part);
        $first_part = strip_tags(owa_lib::inputFilter($first_part));
        if (strlen($first_part) > $this->snip_len) {
            $part1 = substr($first_part, -$this->snip_len);
        } else {
            $part1 = $first_part;
        }

        // Create second segment of snippet. Extra raw bytes are read because
        // markup is removed before the text is cut down to snip_len.
        $part2 = trim((string) substr($nohtml, $start + strlen($anchor_tag), $this->snip_len + 300));
        $part2 = str_replace($replace_items, '', $part2);
        $part2 = (string) substr(strip_tags(owa_lib::inputFilter($part2)), 0, $this->snip_len);

        // Put humpty dumpty back together again and create actual snippet
        return $this->snip_str.$part1.' '.owa_lib::inputFilter($anchor_tag).' '.$part2.$this->snip_str;
    }

    /**
     * Extracts the text of the <title> element from $this->response.
     *
     * The pattern also matches </head> and <body>, so the search ends at
     * whichever of the three appears first in the document.
     *
     * @return string|null the trimmed title, or null when no title was matched
     */
    function extract_title() {

        $m = array();
        preg_match(
            '~(</head>|<body\b[^>]*>|(<title\b[^>]*>\s*(.*?)\s*</title>))~i',
            (string) $this->response,
            $m
        );

        $this->e->debug("referer title extract: ". print_r($m, true));

        // Group 3 only exists when the <title> alternative matched.
        return isset($m[3]) ? $m[3] : null;
    }

    /**
     * Removes the given elements from a string of markup.
     *
     * Each entry of $tags is matched as the start of a tag name, so 'rdf:'
     * matches any element whose name begins with "rdf:".
     *
     * @param string $str          markup to process
     * @param array  $tags         tag names (or name prefixes) to strip
     * @param bool   $stripContent true removes the element with its content,
     *                             false removes only the opening and closing
     *                             tags and keeps the content between them
     * @return string
     */
    function strip_selected_tags($str, $tags = array(), $stripContent = false) {

        // '$2' is the back-reference to the text between the two tags.
        $replacement = ($stripContent == true) ? '' : '$2';

        foreach ($tags as $tag) {
            $quoted = preg_quote($tag, '#');
            $pattern = sprintf('#(<%s.*?>)(.*?)(<\/%s.*?>)#is', $quoted, $quoted);
            $result = preg_replace($pattern, $replacement, $str);

            // preg_replace() returns null on a PCRE error; keep the
            // unmodified text rather than losing the whole document.
            if ($result !== null) {
                $str = $result;
            }
        }

        return $str;
    }

    /**
     * Creates and configures the shared http_class instance on first use.
     */
    function SetupHTTP() {

        if (isset($this->http)) {
            return;
        }

        $this->http = new http_class;
        $this->http->follow_redirect = 1;
        $this->http->debug = 0;
        $this->http->debug_response_body = 0;
        $this->http->html_debug = 1;
        $this->http->user_agent = owa_coreAPI::getSetting('base', 'owa_user_agent');
        $this->http->timeout = 3;
        $this->http->data_timeout = 3;
    }

    /**
     * Opens the connection, sends the request and reads the reply headers.
     *
     * Any status other than 200 is treated as a failure. On failure the
     * reason is stored in $this->error and the connection is closed if it
     * had been opened.
     *
     * @param array $arguments request arguments for http_class
     * @param array $headers   receives the reply headers
     * @return int 1 on success, 0 on failure
     */
    function OpenRequest($arguments, &$headers) {

        // make sure the client exists even when called directly
        $this->SetupHTTP();

        if (strlen($this->error = (string) $this->http->Open($arguments))) {
            return(0);
        }

        if (strlen($this->error = (string) $this->http->SendRequest($arguments))
        || strlen($this->error = (string) $this->http->ReadReplyHeaders($headers))) {
            $this->http->Close();
            return(0);
        }

        if ($this->http->response_status != 200) {
            $this->error = 'the HTTP request returned the status '.$this->http->response_status;
            $this->http->Close();
            return(0);
        }

        return(1);
    }

    /**
     * Reads the complete reply body of the currently open request and
     * closes the connection.
     *
     * @param string $response receives the body read so far
     * @return mixed the body, or 0 when reading failed ($this->error holds the reason)
     */
    function GetRequestResponse(&$response) {

        $response = '';
        $body = '';

        while (true) {
            if (strlen($this->error = (string) $this->http->ReadReplyBody($body, 500000))) {
                $this->http->Close();
                return(0);
            }

            // an empty chunk marks the end of the body
            if (strlen($body) == 0) {
                break;
            }

            $response .= $body;
        }

        $this->http->Close();
        owa_coreAPI::debug('http response code: '.$this->http->response_status);

        return($response);
    }

    /**
     * Performs a GET request and stores the body in $this->response.
     *
     * $this->response is cleared before the request so that a failed
     * request never leaves the body of an earlier document behind.
     *
     * @param string $url
     * @param mixed  $arguments passed to GetRequestArguments() to be filled in
     * @param string $response  scratch buffer handed to GetRequestResponse()
     * @return mixed the response body, or 0 on failure ($this->error holds the reason)
     */
    function getRequest($url, $arguments = '', $response = '') {

        $this->SetupHTTP();
        $this->response = '';

        $this->error = (string) $this->http->GetRequestArguments($url, $arguments);
        if (strlen($this->error)) {
            return(0);
        }

        $arguments['RequestMethod'] = 'GET';

        $headers = array();
        if (!$this->OpenRequest($arguments, $headers)) {
            return(0);
        }

        $this->response = $this->GetRequestResponse($response);

        return($this->response);
    }
}

?>