Files
admin_hura_8/inc/Hura8/System/HtmlParser.php

246 lines
8.3 KiB
PHP
Raw Normal View History

2024-01-29 10:39:53 +07:00
<?php
/**
* Created by PhpStorm.
* Date: 11/29/17
* Time: 10:48 PM
*/
namespace Hura8\System;
/**
 * Extracts fragments from an HTML document using pipe-delimited "code lines".
 *
 * A code line has up to six fields separated by "|":
 *
 *   type | start(.*?)end | remove_word1;word2 | sepa | prepend | replace_list
 *
 *   - type         free-form label (e.g. singlePro, pagePro); stored but not interpreted here
 *   - code         regular expression; capture group 1 is the extracted block
 *   - removed      ";"-separated words to strip from the block
 *   - sepa         separator; only the text after its last occurrence is kept
 *   - extra_url    text prepended to a non-empty result
 *   - invalid      ";"-separated "search#replacement" pairs applied to the block
 *
 * Because "|" is the field separator, the regular expression itself cannot contain "|".
 */
class HtmlParser
{
    /** @var string Source HTML with line breaks and tabs replaced by spaces (see setSource()). */
    private $source_html = '';

    public function __construct() {
    }

    /**
     * Sets the document to extract from.
     *
     * Line breaks and tabs are replaced by spaces; the extraction patterns are run
     * without the "s" modifier, so this lets "." span what used to be several lines.
     *
     * @param string $source_html
     */
    public function setSource($source_html){
        $this->source_html = str_replace(array("\n", "\r", "\t")," ", $source_html);
    }

    /**
     * Extracts content using one code line, or the first code line of a list that yields something.
     *
     * @param array|string $pattern_arr_or_str ['pattern1', 'pattern2'] or 'pattern'
     * @param bool $match_once true: return the first match; false: return every match
     * @param bool $find_image also collect image URLs found inside each matched block
     * @param string $boundary_pattern regex locating a smaller block within the source so that
     *                                 matching does not wander over the whole document
     * @return array|bool with $match_once: array('result' => string, 'images' => array|null) or false;
     *                    without: a list of such arrays (for a list of patterns, false when none matched)
     */
    public function extract($pattern_arr_or_str, $match_once = true, $find_image = false, $boundary_pattern = "") {
        if (!is_array($pattern_arr_or_str)) {
            // default is string
            return $this->extractSinglePattern($pattern_arr_or_str, $match_once, $find_image, $boundary_pattern);
        }

        foreach ($pattern_arr_or_str as $pattern) {
            $result = $this->extractSinglePattern($pattern, $match_once, $find_image, $boundary_pattern);
            if (!is_array($result)) {
                continue;
            }
            // Single-match results carry a 'result' key; multi-match results are a plain
            // list, so any non-empty list counts as a hit.
            $found = $match_once ? isset($result['result']) : (count($result) > 0);
            if ($found) {
                return $result;
            }
        }

        return false;
    }

    /**
     * Runs a single code line against the source.
     *
     * @param string $pattern code line, see the class description
     * @param bool $match_once
     * @param bool $find_image
     * @param string $boundary_pattern
     * @return array|bool false when the code line has no regex or (single mode) nothing matched
     */
    protected function extractSinglePattern($pattern, $match_once = true, $find_image = false, $boundary_pattern = "") {
        $elements = $this->getCodeElement($pattern);
        if ($elements === false || !$elements["code"]) {
            return false;
        }

        $match = array();
        $source_html = $this->source_html;

        // Narrow the search area when a boundary pattern is given and found.
        if ($boundary_pattern && preg_match($this->buildRegex($boundary_pattern), $source_html, $match)) {
            $source_html = isset($match[1]) ? $match[1] : $match[0];
        }

        $regex = $this->buildRegex($elements["code"]);

        if ($match_once) {
            if (!preg_match($regex, $source_html, $match)) {
                return false;
            }
            // Fall back to the whole match when the pattern has no capture group.
            $html_block = isset($match[1]) ? $match[1] : $match[0];

            return array(
                "result" => $this->cleanHtmlBlock($html_block, $elements),
                "images" => ($find_image) ? $this->extractImages($html_block) : null
            );
        }

        $results = array();
        if (preg_match_all($regex, $source_html, $match)) {
            $html_blocks = isset($match[1]) ? $match[1] : $match[0];
            foreach ($html_blocks as $html_block) {
                $results[] = array(
                    "result" => $this->cleanHtmlBlock($html_block, $elements),
                    "images" => ($find_image) ? $this->extractImages($html_block) : null
                );
            }
        }

        return $results;
    }

    /**
     * Wraps a user-supplied pattern in "@" delimiters with the case-insensitive modifier.
     *
     * Unescaped "@" characters inside the pattern are escaped so they cannot end the regex early.
     *
     * @param string $pattern
     * @return string
     */
    private function buildRegex($pattern) {
        // Match either an escaped pair (left untouched) or a bare "@" (escaped).
        $escaped = preg_replace_callback('/\\\\.|@/s', function ($m) {
            return ($m[0] === '@') ? '\\@' : $m[0];
        }, $pattern);

        return "@" . $escaped . "@i";
    }

    /**
     * Applies the regex pass used by cleanHtmlBlock().
     *
     * The word is used as a pattern between "{" and "}" delimiters. When it is not a valid
     * regular expression the subject is returned unchanged instead of being lost.
     *
     * @param string $word
     * @param string $replacement
     * @param string $subject
     * @return string
     */
    private function replaceByRegex($word, $replacement, $subject) {
        // Plain words such as "(" are expected here; silence the compile warning they cause.
        set_error_handler(function () {
            return true;
        }, E_WARNING);
        $replaced = preg_replace("{".addslashes($word)."}", $replacement, $subject);
        restore_error_handler();

        return ($replaced === null) ? $subject : $replaced;
    }

    /**
     * Post-processes a matched block according to the code-line options.
     *
     * Steps, in order: strip the "removed" words, keep only the text after the last "sepa",
     * prepend "extra_url" to a non-empty result, apply the "invalid" replacement pairs, trim.
     *
     * @param string $html_block
     * @param array $elements parsed code line from getCodeElement()
     * @return string
     */
    private function cleanHtmlBlock($html_block, $elements) {
        if ($elements["removed"]) {
            $arrayRemover = array_filter(explode(";", $elements["removed"]));
            foreach ($arrayRemover as $char_removed) {
                // stripslashes runs first, so backslash escapes in the word do not survive.
                $char_removed = stripslashes(trim($char_removed));
                if ($char_removed === "") {
                    // A blank entry would otherwise strip every ";" from the block.
                    continue;
                }
                // ";" separates the words, so an entity such as &nbsp; is configured
                // without its semicolon; remove the "word;" form first.
                $html_block = str_replace($char_removed.";", "", $html_block);
                $html_block = str_replace($char_removed, "", $html_block);
                $html_block = $this->replaceByRegex($char_removed, "", $html_block);
            }
        }

        if ($elements["sepa"]) {
            // Keep what follows the last separator; an absent separator yields an empty block.
            $sepa_at = strrpos($html_block, $elements["sepa"]);
            $html_block = ($sepa_at === false)
                ? ""
                : (string) substr($html_block, $sepa_at + strlen($elements["sepa"]));
        }

        if ($elements["extra_url"] && $html_block) {
            $html_block = $elements["extra_url"] . trim($html_block);
        }

        // Replace phrases from the source with the configured substitutes.
        if ($elements["invalid"]) {
            $arrayReplace = array_filter(explode(";", $elements["invalid"]));
            foreach ($arrayReplace as $replace_group) {
                $replace_group_a = explode("#", stripslashes($replace_group));
                $search = trim($replace_group_a[0]);
                $replacement = isset($replace_group_a[1]) ? trim($replace_group_a[1]) : "";
                if ($search === "") {
                    // An empty pattern would insert the replacement between every character.
                    continue;
                }
                // NOTE: both passes run, so a replacement that itself contains the search
                // text is expanded twice.
                $html_block = str_replace($search, $replacement, $html_block);
                $html_block = $this->replaceByRegex($search, $replacement, $html_block);
            }
        }

        return trim($html_block);
    }

    /**
     * Collects the distinct src values of img tags found in an HTML fragment.
     *
     * @param string $source_html
     * @return array list of URLs in order of first appearance
     */
    private function extractImages($source_html){
        $img_match = array();
        if (preg_match_all("/img(.*?)?src\s*=\s*\\\\?[\'\"]?([+:%\/\?~=&;\(\),|!._a-zA-Z0-9-]*)[\'\"]?/i", $source_html, $img_match)) {
            // array_unique keeps the original keys; re-index so the result is a plain list.
            return array_values(array_unique($img_match[2]));
        }

        return array();
    }

    /**
     * Splits a code line into its named fields.
     *
     * Format:
     *   singlePro | start(.*?)end | remove_word1;word2 | sepa | prepend | replace_list
     *   pagePro   | start(.*?)end | remove_word1;word2 | sepa | prepend | replace_list
     *
     * @param string $code_line
     * @return array|false array with keys type, code, removed, sepa, extra_url, invalid
     *                     (missing fields are ""), or false for an empty code line
     */
    private function getCodeElement($code_line){
        if (!$code_line) {
            return false;
        }

        $fields = array("type", "code", "removed", "sepa", "extra_url", "invalid");
        $element = explode("|", trim($code_line));

        $result = array();
        foreach ($fields as $index => $field) {
            $result[$field] = array_key_exists($index, $element) ? trim($element[$index]) : "";
        }

        return $result;
    }

    /**
     * Turns a link found in a page into a full URL.
     *
     * @param string $url link as found in the page
     * @param string $base_url URL of the page; when empty $url is returned untouched
     * @return string "" for links shorter than two characters
     */
    private function buildFullUrl($url, $base_url){
        if (!$base_url) {
            return $url;
        }
        if (strlen($url) < 2) {
            return "";
        }
        // Only a leading scheme or "www." marks the link as already complete; the same
        // words appearing later (e.g. inside a query string) must not.
        if (preg_match("/^\s*(https?:|www\.|javascript:|mailto:|ymsgr:)/i", $url)) {
            return $url;
        }

        return $this->convert_to_absolute($base_url, $url);
    }

    /**
     * Resolves a relative reference against a base URL.
     *
     * The result is built from the base scheme, host and port plus the merged path with
     * "." and ".." segments removed, followed by the query/fragment of the relative URL.
     *
     * @param string $absolute base URL
     * @param string $relative reference to resolve
     * @return string
     */
    private function convert_to_absolute($absolute, $relative) {
        $relative = (string) $relative;
        if ($relative === "") {
            return $absolute;
        }

        $p = parse_url($relative);
        if (isset($p["scheme"]) || stripos($relative, "www.") === 0 || substr($relative, 0, 2) == '//') {
            return $relative; //it's absolute
        }

        $first_letter = $relative[0];

        if ($first_letter === '?' || $first_letter === ';') {
            // Swap the base's own query/parameter part for the new one.
            $cut = strrpos($absolute, $first_letter);
            $stem = ($cut === false) ? $absolute : substr($absolute, 0, $cut);
            return $stem . $relative;
        }
        if ($first_letter === "#") {
            return $absolute; //already crawled this page
        }

        $base = parse_url($absolute);
        if (!is_array($base)) {
            $base = array();
        }
        $scheme = isset($base["scheme"]) ? $base["scheme"] : "";
        $host = isset($base["host"]) ? $base["host"] : "";
        $port = isset($base["port"]) ? ":" . $base["port"] : "";
        $base_path = isset($base["path"]) ? $base["path"] : "";

        // A base path that does not end in "/" names a file: resolve against its directory.
        if (substr($base_path, -1) !== "/") {
            $slash_at = strrpos($base_path, "/");
            $base_path = ($slash_at === false) ? "" : substr($base_path, 0, $slash_at + 1);
        }

        // Keep the query/fragment out of the path handling so its slashes are not touched.
        $path_length = strcspn($relative, "?#");
        $relative_path = (string) substr($relative, 0, $path_length);
        $suffix = (string) substr($relative, $path_length);

        $segments = ($first_letter === '/') ? array() : explode("/", $base_path);
        $segments = array_merge($segments, explode("/", $relative_path));

        $cparts = array();
        foreach ($segments as $segment) {
            if ($segment === "" || $segment === ".") {
                continue;
            }
            if ($segment === "..") {
                array_pop($cparts);
                continue;
            }
            $cparts[] = $segment;
        }

        $path = implode("/", $cparts);
        if ($path !== "" && substr($relative_path, -1) === '/') {
            $path .= "/";
        }

        $url = "";
        if ($scheme) {
            $url = "$scheme://";
        }
        if ($host) {
            $url .= $host . $port . "/";
        }
        $url .= $path . $suffix;

        return $url;
    }
}