246 lines
8.3 KiB
PHP
246 lines
8.3 KiB
PHP
<?php
|
|
/**
 * Created by PhpStorm.
 * Date: 11/29/17
 * Time: 10:48 PM
 */
|
|
|
|
namespace Hura8\System;
|
|
|
|
/**
 * Regex-driven extractor that pulls text blocks (and optionally image URLs) out of an HTML string.
 *
 * Every extraction rule is a single pipe-delimited "code line":
 *
 *   type | start(.*?)end | remove1;remove2 | separator | prefix | search1#replace1;search2#replace2
 *
 *   - type       free label (e.g. singlePro, pagePro); parsed but not interpreted by this class
 *   - code       regular expression (matched case-insensitively); capture group 1 is the extracted block
 *   - removed    ";"-separated fragments that are deleted from the block
 *   - sepa       keep only the text that follows the last occurrence of this separator
 *   - extra_url  text prepended to a non-empty block
 *   - invalid    ";"-separated "search#replace" pairs applied to the block
 *
 * Fields are optional from the right. A field that is empty or "0" is treated as not set.
 * Because "|" delimits the fields, the regular expression itself cannot contain a literal "|".
 */
class HtmlParser
{
    /** @var string HTML to search; newlines and tabs have already been replaced by spaces */
    private $source_html = '';

    public function __construct() {
    }

    /**
     * Stores the HTML to search. Line breaks and tabs are flattened to spaces so that
     * "." in the extraction patterns can span what used to be several lines.
     *
     * @param string $source_html
     */
    public function setSource($source_html) {
        $this->source_html = str_replace(array("\n", "\r", "\t"), " ", (string) $source_html);
    }

    /**
     * Extracts content using one code line, or the first of several code lines that yields a match.
     *
     * @param array|string $pattern_arr_or_str ['pattern1', 'pattern2'] or 'pattern'
     * @param bool $match_once true: return the first match; false: return every match
     * @param bool $find_image also collect image URLs found inside each matched block
     * @param string $boundary_pattern regex selecting a smaller region of the source to search in,
     *                                 so that the content pattern does not wander around the whole page
     * @return array|bool
     *   - $match_once = true:  ['result' => string, 'images' => array|null], or false when nothing matched
     *   - $match_once = false: list of such arrays; for a single string pattern the list may be empty,
     *     for an array of patterns false is returned when none of them matched
     */
    public function extract($pattern_arr_or_str, $match_once = true, $find_image = false, $boundary_pattern = "") {
        if (!is_array($pattern_arr_or_str)) {
            // default is string
            return $this->extractSinglePattern($pattern_arr_or_str, $match_once, $find_image, $boundary_pattern);
        }

        foreach ($pattern_arr_or_str as $pattern) {
            $result = $this->extractSinglePattern($pattern, $match_once, $find_image, $boundary_pattern);
            if (!is_array($result)) {
                continue;
            }

            // A single match is keyed by 'result'; a multi-match result is a plain list,
            // so it has to be recognised by being non-empty instead.
            $found = $match_once ? isset($result['result']) : (count($result) > 0);
            if ($found) {
                return $result;
            }
        }

        return false;
    }

    /**
     * Runs one code line against the source (optionally narrowed by the boundary pattern).
     *
     * @param string $pattern code line, see the class description for the format
     * @param bool $match_once
     * @param bool $find_image
     * @param string $boundary_pattern
     * @return array|bool false when the code line has no regex or (match-once) nothing matched
     */
    protected function extractSinglePattern($pattern, $match_once = true, $find_image = false, $boundary_pattern = "") {
        $elements = $this->getCodeElement($pattern);
        if ($elements === false || !$elements["code"]) {
            return false;
        }

        $match = array();
        $source_html = $this->source_html;

        // Narrow the search area when a boundary is given and found; otherwise search everything.
        if ($boundary_pattern && preg_match($this->buildRegex($boundary_pattern), $source_html, $match)) {
            $source_html = isset($match[1]) ? $match[1] : $match[0];
        }

        // Both modes use the same case-insensitive regex so they agree on what counts as a match.
        $regex = $this->buildRegex($elements["code"]);

        if ($match_once) {
            if (!preg_match($regex, $source_html, $match)) {
                return false;
            }

            // Fall back to the whole match when the pattern defines no capture group.
            $html_block = isset($match[1]) ? $match[1] : $match[0];

            return $this->buildResult($html_block, $elements, $find_image);
        }

        $results = array();
        if (preg_match_all($regex, $source_html, $match)) {
            $html_blocks = isset($match[1]) ? $match[1] : $match[0];
            foreach ($html_blocks as $html_block) {
                $results[] = $this->buildResult($html_block, $elements, $find_image);
            }
        }

        return $results;
    }

    /**
     * Wraps a user-supplied pattern in delimiters and appends the modifiers.
     *
     * "@" is the historical delimiter; when the pattern itself contains "@" another
     * delimiter that does not occur in the pattern is chosen so the regex still compiles.
     *
     * @param string $pattern
     * @param string $modifiers
     * @return string
     */
    private function buildRegex($pattern, $modifiers = "i") {
        foreach (array("@", "#", "~", "%", "!") as $delimiter) {
            if (strpos($pattern, $delimiter) === false) {
                return $delimiter . $pattern . $delimiter . $modifiers;
            }
        }

        // Every candidate occurs in the pattern: keep the historical wrapping.
        return "@" . $pattern . "@" . $modifiers;
    }

    /**
     * Builds the result entry for one matched block.
     *
     * @param string $html_block raw matched text
     * @param array $elements parsed code line
     * @param bool $find_image
     * @return array ['result' => cleaned text, 'images' => list of URLs, or null when not requested]
     */
    private function buildResult($html_block, $elements, $find_image) {
        return array(
            "result" => $this->cleanHtmlBlock($html_block, $elements),
            "images" => ($find_image) ? $this->extractImages($html_block) : null
        );
    }

    /**
     * Applies the post-processing fields of a code line to a matched block, in this order:
     * removed, sepa, extra_url, invalid. The result is trimmed.
     *
     * @param string $html_block
     * @param array $elements parsed code line as returned by getCodeElement()
     * @return string
     */
    private function cleanHtmlBlock($html_block, $elements) {
        $html_block = (string) $html_block;

        if ($elements["removed"]) {
            foreach (array_filter(explode(";", $elements["removed"])) as $char_removed) {
                $char_removed = stripslashes(trim($char_removed));
                if ($char_removed === "") {
                    // a blank entry would otherwise strip every ";" from the block
                    continue;
                }

                $html_block = str_replace($char_removed . ";", "", $html_block); // sometimes followed by ";"
                $html_block = str_replace($char_removed, "", $html_block);
                // The fragment is also tried as a regex; a fragment that is not a valid
                // regex (e.g. "(") must not wipe out the block.
                $html_block = $this->regexReplaceOrKeep("{" . addslashes($char_removed) . "}", "", $html_block);
            }
        }

        if ($elements["sepa"]) {
            // Keep what follows the last separator. The whole separator is searched for,
            // not just its first character. A block without the separator becomes empty.
            $sepa_pos = strrpos($html_block, $elements["sepa"]);
            $html_block = ($sepa_pos === false)
                ? ""
                : (string) substr($html_block, $sepa_pos + strlen($elements["sepa"]));
        }

        if ($elements["extra_url"] && $html_block) {
            $html_block = $elements["extra_url"] . trim($html_block);
        }

        // replace phrases from the source with the configured replacement phrases
        if ($elements["invalid"]) {
            foreach (array_filter(explode(";", $elements["invalid"])) as $replace_group) {
                $replace_pair = explode("#", stripslashes($replace_group));
                $search = trim($replace_pair[0]);
                $replacement = isset($replace_pair[1]) ? trim($replace_pair[1]) : "";

                if ($search === "") {
                    // an empty regex matches between every character and would scatter the replacement
                    continue;
                }

                $html_block = str_replace($search, $replacement, $html_block);

                // The regex pass runs on the already-replaced text. When the replacement contains
                // the search text (http -> https) it would expand it a second time (httpss), so skip it.
                if (strpos($replacement, $search) === false) {
                    $html_block = $this->regexReplaceOrKeep("{" . addslashes($search) . "}", $replacement, $html_block);
                }
            }
        }

        return trim($html_block);
    }

    /**
     * preg_replace() that leaves the subject untouched when the regex cannot be compiled or executed.
     *
     * preg_replace() returns null (and raises a warning) in that case; the warning is
     * suppressed here because the failure is handled by returning the subject unchanged.
     *
     * @param string $regex complete regex including delimiters
     * @param string $replacement
     * @param string $subject
     * @return string
     */
    private function regexReplaceOrKeep($regex, $replacement, $subject) {
        set_error_handler(function () {
            return true;
        }, E_WARNING);
        $replaced = preg_replace($regex, $replacement, $subject);
        restore_error_handler();

        return ($replaced === null) ? $subject : $replaced;
    }

    /**
     * Collects the distinct src values that follow an "img" token in the given HTML.
     *
     * @param string $source_html
     * @return array list of URLs in order of first appearance, re-indexed from 0
     */
    private function extractImages($source_html) {
        $img_match = array();
        if (preg_match_all("/img(.*?)?src\s*=\s*\\\\?[\'\"]?([+:%\/\?~=&;\(\),|!._a-zA-Z0-9-]*)[\'\"]?/i", $source_html, $img_match)) {
            // array_unique() keeps the original keys; re-index so callers get a plain list
            return array_values(array_unique($img_match[2]));
        }

        return array();
    }

    /**
     * Splits a code line into its named fields.
     *
     *   singlePro | start(.*?)end | remove_word;word2 | sepa | prefix | invalid_list
     *   pagePro   | start(.*?)end | remove_word;word2 | sepa | prefix | invalid_list
     *
     * @param string $code_line
     * @return array|false false for an empty code line; otherwise an array with the keys
     *                     type, code, removed, sepa, extra_url, invalid (missing fields are "")
     */
    private function getCodeElement($code_line) {
        if (!$code_line) {
            return false;
        }

        $element = explode("|", trim($code_line));
        $field_names = array("type", "code", "removed", "sepa", "extra_url", "invalid");

        $result = array();
        foreach ($field_names as $index => $field_name) {
            $result[$field_name] = array_key_exists($index, $element) ? trim($element[$index]) : "";
        }

        return $result;
    }

    /**
     * Turns a link found on a page into a full URL. Not called from within this class.
     *
     * @param string $url link as found in the page
     * @param string $base_url URL of the page the link was found on
     * @return string $url unchanged when there is no base or the link is already absolute /
     *                a javascript:, mailto: or ymsgr: link; "" for links shorter than 2 characters;
     *                otherwise the link resolved against $base_url
     */
    private function buildFullUrl($url, $base_url) {
        if (!$base_url) {
            return $url;
        }
        if (strlen($url) < 2) {
            return "";
        }

        // Anchored to the start: a relative path that merely contains "http" or "www"
        // (e.g. "img/http-logo.png") still has to be resolved.
        if (preg_match("/^(?:https?:|www\.|javascript:|mailto:|ymsgr:)/i", $url)) {
            return $url;
        }

        return $this->convert_to_absolute($base_url, $url);
    }

    /**
     * Resolves a relative link against an absolute URL.
     *
     * - links with a scheme, protocol-relative links ("//host/...") and links starting
     *   with "www." are returned as they are
     * - "#fragment" links return the base URL (same page)
     * - "?query" / ";params" links replace everything from the last "?" / ";" of the base URL
     * - all other links are merged with the directory of the base path ("/..." links replace
     *   the path); "." and ".." segments are resolved and empty segments are dropped
     *
     * The query string of the base URL is not carried over. When the base URL has no host,
     * the result is the resolved path without a leading slash.
     *
     * @param string $absolute base URL
     * @param string $relative link to resolve
     * @return string
     */
    private function convert_to_absolute($absolute, $relative) {
        $relative = trim((string) $relative);
        if ($relative === "") {
            return $absolute;
        }

        $relative_parts = parse_url($relative);
        $has_scheme = is_array($relative_parts) && isset($relative_parts["scheme"]);
        if ($has_scheme || substr($relative, 0, 2) === "//" || stripos($relative, "www.") === 0) {
            return $relative; // it's absolute
        }

        $first_letter = $relative[0];

        if ($first_letter === "?" || $first_letter === ";") {
            $cut_at = strrpos($absolute, $first_letter);
            $page = ($cut_at === false) ? $absolute : substr($absolute, 0, $cut_at);

            return $page . $relative;
        }

        if ($first_letter === "#") {
            return $absolute; // already crawled this page
        }

        $base = parse_url($absolute);
        if (!is_array($base)) {
            $base = array();
        }
        $base_path = isset($base["path"]) ? $base["path"] : "";

        // Only the path part of the link takes part in segment resolution;
        // its query/fragment is re-attached untouched at the end.
        $suffix_at = strcspn($relative, "?#");
        $relative_path = substr($relative, 0, $suffix_at);
        $suffix = (string) substr($relative, $suffix_at);

        $segments = array();
        if ($first_letter !== "/") {
            // directory of the base document = its path up to the last "/"
            $last_slash = strrpos($base_path, "/");
            $base_dir = ($last_slash === false) ? "" : substr($base_path, 0, $last_slash);
            $segments = explode("/", $base_dir);
        }
        $segments = array_merge($segments, explode("/", $relative_path));

        $resolved = array();
        foreach ($segments as $segment) {
            if ($segment === "" || $segment === ".") {
                continue;
            }
            if ($segment === "..") {
                array_pop($resolved); // going above the root is ignored
                continue;
            }
            $resolved[] = $segment; // compared strictly, so a "0" segment is kept
        }

        $path = implode("/", $resolved);
        if ($path !== "" && substr($relative_path, -1) === "/") {
            $path .= "/";
        }

        $url = "";
        if (isset($base["scheme"]) && $base["scheme"] !== "") {
            $url = $base["scheme"] . "://";
        }

        if (isset($base["host"]) && $base["host"] !== "") {
            if (isset($base["user"])) {
                $url .= $base["user"];
                if (isset($base["pass"])) {
                    $url .= ":" . $base["pass"];
                }
                $url .= "@";
            }
            $url .= $base["host"];
            if (isset($base["port"])) {
                $url .= ":" . $base["port"];
            }
            $url .= "/";
        }

        return $url . $path . $suffix;
    }
}
|