source_html = str_replace(array("\n", "\r", "\t")," ", $source_html);
    }

    /**
     * Runs one extraction pattern, or a list of fallback patterns, against the
     * stored source HTML.
     *
     * When a list is given the patterns are tried in order and the first usable
     * outcome is returned: in single-match mode that is an array carrying a
     * "result" key, in multi-match mode it is a non-empty list of such arrays.
     *
     * @param array|string $pattern_arr_or_str ['pattern1', 'pattern2'] or 'pattern'
     *                                          (see getCodeElement() for the pattern format)
     * @param bool   $match_once       true: return the first match only; false: return every match
     * @param bool   $find_image       also collect the image URLs found in each matched block
     * @param string $boundary_pattern regex body used to narrow the source HTML to a smaller
     *                                 block before matching, so that the content pattern does
     *                                 not wander over the whole page
     * @return array|bool single-match: array("result" => string, "images" => array|null) or false;
     *                    multi-match with a string pattern: list of such arrays (possibly empty),
     *                    or false when the pattern has no code part;
     *                    multi-match with a pattern list: first non-empty list, or false
     */
    public function extract($pattern_arr_or_str, $match_once = true, $find_image = false, $boundary_pattern = "")
    {
        if (!is_array($pattern_arr_or_str)) {
            // default is string
            return $this->extractSinglePattern($pattern_arr_or_str, $match_once, $find_image, $boundary_pattern);
        }

        foreach ($pattern_arr_or_str as $pattern) {
            $result = $this->extractSinglePattern($pattern, $match_once, $find_image, $boundary_pattern);
            if (!is_array($result)) {
                continue;
            }
            if (isset($result['result'])) {
                return $result;
            }
            // In multi-match mode the outcome is a list of blocks, which never has a
            // top-level "result" key; accept the first pattern that matched anything.
            if (!$match_once && count($result) > 0) {
                return $result;
            }
        }

        return false;
    }

    /**
     * Applies a single extraction pattern to the stored source HTML.
     *
     * The "code" part of the pattern and $boundary_pattern are both wrapped in "@"
     * delimiters, so neither may contain an unescaped "@". The text handed to the
     * clean-up step is capture group 1, or the whole match when the pattern has no
     * such group.
     *
     * NOTE(review): single-match mode is case-insensitive ("i" flag) while multi-match
     * mode is case-sensitive. That difference is kept as-is here; confirm whether it
     * is intentional.
     *
     * @param string $pattern
     * @param bool   $match_once
     * @param bool   $find_image
     * @param string $boundary_pattern
     * @return array|bool
     */
    protected function extractSinglePattern($pattern, $match_once = true, $find_image = false, $boundary_pattern = "")
    {
        $elements = $this->getCodeElement($pattern);
        // getCodeElement() returns false for an empty pattern; do not index into it.
        if (!is_array($elements) || !$elements["code"]) {
            return false;
        }

        $match = array();
        $source_html = (string) $this->source_html;

        // Narrow the haystack to the boundary block when a boundary pattern is given and matches.
        if ($boundary_pattern && preg_match("@" . $boundary_pattern . "@i", $source_html, $match)) {
            $source_html = isset($match[1]) ? $match[1] : $match[0];
        }

        if ($match_once) {
            if (preg_match("@" . $elements["code"] . "@i", $source_html, $match)) {
                $html_block = isset($match[1]) ? $match[1] : $match[0];

                return array(
                    "result" => $this->cleanHtmlBlock($html_block, $elements),
                    "images" => ($find_image) ? $this->extractImages($html_block) : null
                );
            }

            return false;
        }

        $results = array();
        if (preg_match_all("@" . $elements["code"] . "@", $source_html, $match)) {
            $html_blocks = isset($match[1]) ? $match[1] : $match[0];
            foreach ($html_blocks as $html_block) {
                $results[] = array(
                    "result" => $this->cleanHtmlBlock($html_block, $elements),
                    "images" => ($find_image) ? $this->extractImages($html_block) : null
                );
            }
        }

        return $results;
    }

    /**
     * Post-processes a matched block according to the parsed pattern elements.
     *
     * Steps, in order:
     *  1. "removed":   ";"-separated tokens, each removed as literal text and then
     *                  once more as a regular expression;
     *  2. "sepa":      keep only the text after the last occurrence of the separator
     *                  (the result is empty when the separator does not occur);
     *  3. "extra_url": prefix prepended to a non-empty result;
     *  4. "invalid":   ";"-separated "search#replacement" rules, applied as literal
     *                  text and then as a regular expression.
     *
     * @param string $html_block
     * @param array  $elements   as returned by getCodeElement()
     * @return string
     */
    private function cleanHtmlBlock($html_block, $elements)
    {
        $html_block = (string) $html_block;

        if ($elements["removed"]) {
            $removal_tokens = array_filter(explode(";", $elements["removed"]));
            foreach ($removal_tokens as $token) {
                $token = stripslashes(trim($token));
                if ($token === "") {
                    // A whitespace-only token would otherwise strip every ";" from the block.
                    continue;
                }
                // The list is split on ";", so an entity such as &nbsp; arrives without its
                // trailing semicolon: remove the "token;" form first, then the bare token.
                $html_block = str_replace($token . ";", "", $html_block);
                $html_block = str_replace($token, "", $html_block);

                // The token is additionally applied as a regex. preg_replace() returns null
                // when the token is not a valid pattern; keep the block in that case.
                $replaced = preg_replace("{" . addslashes($token) . "}", "", $html_block);
                if ($replaced !== null) {
                    $html_block = $replaced;
                }
            }
        }

        if ($elements["sepa"]) {
            // strrchr() only honours the first character of its needle, so locate the full
            // separator with strrpos() instead.
            $separator_pos = strrpos($html_block, $elements["sepa"]);
            $html_block = ($separator_pos === false)
                ? ""
                : (string) substr($html_block, $separator_pos + strlen($elements["sepa"]));
        }

        if ($elements["extra_url"] && $html_block) {
            $html_block = $elements["extra_url"] . trim($html_block);
        }

        // Replace phrases found in the source with the configured substitutes.
        if ($elements["invalid"]) {
            $replace_rules = array_filter(explode(";", $elements["invalid"]));
            foreach ($replace_rules as $replace_rule) {
                $rule_parts = explode("#", stripslashes($replace_rule));
                $search = trim($rule_parts[0]);
                // A rule without "#" has no replacement part: treat it as a removal.
                $replacement = isset($rule_parts[1]) ? trim($rule_parts[1]) : "";
                if ($search === "") {
                    // An empty regex matches between every character and would scatter the
                    // replacement all over the block.
                    continue;
                }

                $html_block = str_replace($search, $replacement, $html_block);
                $replaced = preg_replace("{" . addslashes($search) . "}", $replacement, $html_block);
                if ($replaced !== null) {
                    $html_block = $replaced;
                }
            }
        }

        return trim($html_block);
    }

    /**
     * Collects the distinct "src" values of the img tags found in the given HTML.
     *
     * array_unique() keeps the original match indexes, so the keys of the returned
     * array are not necessarily consecutive.
     *
     * @param string $source_html
     * @return array
     */
    private function extractImages($source_html)
    {
        $img_match = array();
        if (preg_match_all("/img(.*?)?src\s*=\s*\\\\?[\'\"]?([+:%\/\?~=&;\(\),|!._a-zA-Z0-9-]*)[\'\"]?/i", $source_html, $img_match)) {
            // Group 2 holds the URL characters captured after src=.
            return array_unique($img_match[2]);
        }

        return array();
    }

    /**
     * Splits a pattern line into its "|"-separated fields.
     *
     * Format (fields past the second are optional):
     *   singlePro | start(.*?)end | remove_word;word2 | sepa | prepend_after | invalid_list
     *   pagePro   | start(.*?)end | remove_word;word2 | sepa | prepend_after | invalid_list
     *
     * Because "|" is the field separator, the regex in the second field cannot use
     * "|" alternation. Missing fields are returned as empty strings.
     *
     * @param string $code_line
     * @return array|false array with keys type, code, removed, sepa, extra_url, invalid;
     *                     false when $code_line is empty
     */
    private function getCodeElement($code_line)
    {
        if (!$code_line) {
            return false;
        }

        $fields = explode("|", trim($code_line));
        // Position in the pattern line => key in the returned array.
        $field_names = array("type", "code", "removed", "sepa", "extra_url", "invalid");

        $result = array();
        foreach ($field_names as $index => $name) {
            $result[$name] = array_key_exists($index, $fields) ? trim($fields[$index]) : "";
        }

        return $result;
    }

    /**
     * Turns a URL found in a page into a full URL, using $base_url as the base.
     *
     * URLs that already start with http(s):, www., javascript:, mailto: or ymsgr: are
     * returned untouched. The check is anchored at the start of the URL so that a
     * relative path which merely contains one of those words (for example
     * "docs/http-intro.html") is still resolved against the base.
     *
     * @param string $url
     * @param string $base_url
     * @return string "" when $url is shorter than two characters
     */
    private function buildFullUrl($url, $base_url)
    {
        if (!$base_url) {
            return $url;
        }
        if (strlen($url) < 2) {
            return "";
        }
        if (preg_match('/^\s*(?:https?:|www\.|javascript:|mailto:|ymsgr:)/i', $url)) {
            return $url;
        }

        return $this->convert_to_absolute($base_url, $url);
    }

    /**
     * Resolves $relative against the absolute URL $absolute.
     *
     * - A reference that has a scheme, starts with "www." or starts with "//" is
     *   returned unchanged.
     * - A reference starting with "?" or ";" replaces the part of $absolute that
     *   begins at the last occurrence of that character.
     * - A reference starting with "#" yields $absolute itself (same page).
     * - Anything else is merged with the directory of the base path; "." and ".."
     *   segments are resolved with a stack, and the query/fragment of the reference
     *   is carried over untouched.
     *
     * The scheme, host and port of $absolute are kept; user info is not.
     *
     * @param string $absolute
     * @param string $relative
     * @return string
     */
    private function convert_to_absolute($absolute, $relative)
    {
        $relative = (string) $relative;
        if ($relative === "") {
            // Nothing to resolve; also avoids reading offset 0 of an empty string.
            return $absolute;
        }

        $first_letter = $relative[0];

        // parse_url() returns false on seriously malformed input.
        $p = parse_url($relative);
        if ((is_array($p) && array_key_exists("scheme", $p))
            || stripos($relative, "www.") === 0
            || substr($relative, 0, 2) == '//'
        ) {
            return $relative; //it's absolute
        }

        if ($first_letter === '?' || $first_letter === ';') {
            $cut_at = strrpos($absolute, $first_letter);
            $kept = ($cut_at === false) ? $absolute : substr($absolute, 0, $cut_at);

            return $kept . $relative;
        }

        if ($first_letter === "#") {
            return $absolute; //already crawled this page
        }

        // Read the base components explicitly instead of extract()-ing them into scope.
        $base = parse_url($absolute);
        if (!is_array($base)) {
            $base = array();
        }
        $scheme = isset($base["scheme"]) ? $base["scheme"] : "";
        $host = isset($base["host"]) ? $base["host"] : "";
        $port = isset($base["port"]) ? ":" . $base["port"] : "";
        $path = isset($base["path"]) ? $base["path"] : "";

        // When the base path names a file, resolve against its directory. The test is
        // made on the path so that a query string on the base URL does not interfere.
        if (substr($path, -1) !== "/") {
            // dirname() may answer with a backslash on Windows.
            $path = str_replace("\\", "/", dirname($path));
        }

        // Keep the query/fragment of the reference out of the path handling.
        $path_length = strcspn($relative, "?#");
        $relative_path = substr($relative, 0, $path_length);
        $suffix = (string) substr($relative, $path_length);

        // A root-relative reference ignores the base path entirely.
        $raw_segments = ($first_letter === '/') ? array() : explode("/", $path);
        $raw_segments = array_merge($raw_segments, explode("/", $relative_path));

        $segments = array();
        foreach ($raw_segments as $segment) {
            // Compare against "" explicitly: a plain array_filter() would also drop "0".
            if ($segment === "" || $segment === ".") {
                continue;
            }
            if ($segment === "..") {
                array_pop($segments); // no-op once the root is reached
                continue;
            }
            $segments[] = $segment;
        }

        $path = implode("/", $segments);
        // Preserve a trailing slash of the reference, without producing "host//".
        if ($path !== "" && substr($relative_path, -1) === "/") {
            $path .= "/";
        }

        $url = "";
        if ($scheme) {
            $url = "$scheme://";
        }
        if ($host) {
            $url .= $host . $port . "/";
        }
        $url .= $path . $suffix;

        return $url;
    }
}