getDom($html);
        // NOTE(review): this method starts before the visible part of the file; the
        // statements below are its original tail, only re-laid-out onto separate lines.
        $xpath = new DOMXPath($doc);
        $xpathQuery = $this->parseRuleToXpath($rule);
        $nodes = $xpath->query($xpathQuery);

        $res = [];
        if ($nodes) {
            foreach ($nodes as $node) {
                // saveHTML($node) returns OuterHTML
                $res[] = $doc->saveHTML($node);
            }
        }

        return $res;
    }

    /**
     * Parse HTML and return single value (Text, Html, or Attribute).
     *
     * The rule is a list of selectors separated by "&&" (or spaces). When the rule
     * contains "&&", its last segment is the output option: "Text", "Html", or an
     * attribute name. Without an option the outer HTML of the first match is returned.
     *
     * @param string $html    HTML source to search.
     * @param string $rule    Selector rule, optionally ending in "&&<option>".
     * @param string $baseUrl Forwarded to formatOutput(); not used for joining here.
     *
     * @return string Extracted value, or '' when the input is empty or nothing matches.
     */
    public function pdfh($html, $rule, $baseUrl = '')
    {
        if (empty($html) || empty($rule)) {
            return '';
        }

        // Separate Option
        $option = '';
        if (strpos($rule, '&&') !== false) {
            $parts = explode('&&', $rule);
            $option = array_pop($parts);
            $rule = implode('&&', $parts);
        }

        // A rule such as "&&Text" leaves no selector at all. Building a query from it
        // would yield the invalid expression "//" and a DOMXPath warning, so stop here.
        if (trim(str_replace('&&', ' ', $rule)) === '') {
            return '';
        }

        $doc = $this->getDom($html);
        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query($this->parseRuleToXpath($rule));

        if (!$nodes || $nodes->length === 0) {
            return '';
        }

        // Special handling for Text option: concatenate all nodes
        if ($option === 'Text') {
            $text = '';
            foreach ($nodes as $node) {
                $text .= $node->textContent;
            }

            return $this->parseText($text);
        }

        // For other options, use the first node
        return $this->formatOutput($doc, $nodes->item(0), $option, $baseUrl);
    }

    /**
     * Parse HTML and return URL (auto joined).
     *
     * Runs pdfh() and resolves its result against $baseUrl with urlJoin().
     *
     * @param string $html    HTML source to search.
     * @param string $rule    Selector rule, see pdfh().
     * @param string $baseUrl Base URL used to resolve the extracted value.
     *
     * @return string Resolved URL, or '' when pdfh() returns nothing.
     */
    public function pd($html, $rule, $baseUrl = '')
    {
        $res = $this->pdfh($html, $rule, $baseUrl);

        return $this->urlJoin($baseUrl, $res);
    }

    // --- Helper Methods ---

    /**
     * Collapse every run of whitespace into a single space and trim both ends.
     *
     * @param string $text Raw text content.
     *
     * @return string Normalised text.
     */
    private function parseText($text)
    {
        // Match JS behavior:
        // text = text.replace(/[\s]+/gm, '\n');
        // text = text.replace(/\n+/g, '\n').replace(/^\s+/, '');
        // text = text.replace(/\n/g, ' ');
        $text = preg_replace('/[\s]+/u', "\n", $text);
        $text = preg_replace('/\n+/', "\n", $text);
        $text = trim($text);
        $text = str_replace("\n", ' ', $text);

        return $text;
    }

    /**
     * Translate a selector rule into an XPath expression.
     *
     * Segments are separated by "&&" or spaces and chained with the descendant axis.
     * A ":eq(n)" on a segment is applied to the whole node set matched so far, i.e.
     * "(//ul//li)[2]" rather than "//ul//li[2]". The latter would select every <li>
     * that is the second <li> of its own parent, which is not what an index into the
     * matched set (as in the JS implementation this mirrors) means.
     *
     * @param string $rule Selector rule without the trailing output option.
     *
     * @return string XPath expression; "//" when the rule contains no selector.
     */
    private function parseRuleToXpath($rule)
    {
        // Replace && with space to unify as descendant separator
        $rule = str_replace('&&', ' ', $rule);

        $path = '';
        foreach (explode(' ', $rule) as $part) {
            if (empty($part)) {
                continue;
            }

            list($selector, $position) = $this->extractPosition($part);
            $path .= '//' . $this->transSingleSelector($selector);

            if ($position !== null) {
                // Index into everything matched up to and including this segment.
                $path = '(' . $path . ')[' . $position . ']';
            }
        }

        // With no usable selector keep returning "//", as before: DOMXPath::query()
        // rejects it and callers treat the false result as "no match".
        return $path === '' ? '//' : $path;
    }

    /**
     * Split an optional ":eq(n)" suffix off a selector.
     *
     * @param string $selector Single selector segment.
     *
     * @return array Two items: the selector with ":eq(n)" removed, and the XPath
     *               position ("1", "2", "last()", "last()-1", ...) or null when the
     *               selector carries no ":eq(n)".
     */
    private function extractPosition($selector)
    {
        if (!preg_match('/:eq\((-?\d+)\)/', $selector, $matches)) {
            return [$selector, null];
        }

        $idx = intval($matches[1]);
        $selector = str_replace($matches[0], '', $selector);

        if ($idx >= 0) {
            // XPath is 1-based
            $position = (string) ($idx + 1);
        } else {
            // -1 is last()
            // -2 is last()-1
            $offset = abs($idx) - 1;
            $position = 'last()' . ($offset > 0 ? "-$offset" : '');
        }

        return [$selector, $position];
    }

    /**
     * Translate one "tag.class#id:eq(n)" selector into a single XPath step.
     *
     * Every part is optional; a missing tag becomes "*". Whatever remains after the
     * id, classes and ":eq(n)" have been removed is used verbatim as the tag.
     *
     * @param string $selector Single selector segment.
     *
     * @return string XPath step such as 'div[@id="x" and contains(...)][2]'.
     */
    private function transSingleSelector($selector)
    {
        // Handle :eq
        list($selector, $position) = $this->extractPosition($selector);

        // Handle tag.class#id
        $tag = '*';
        $conditions = [];

        // Extract id
        if (preg_match('/#([\w-]+)/', $selector, $m)) {
            $conditions[] = '@id="' . $m[1] . '"';
            $selector = str_replace($m[0], '', $selector);
        }

        // Extract classes
        if (preg_match_all('/\.([\w-]+)/', $selector, $m)) {
            foreach ($m[1] as $cls) {
                // Whole-word match inside the space-separated class attribute.
                $conditions[] = 'contains(concat(" ", normalize-space(@class), " "), " ' . $cls . ' ")';
            }
            $selector = preg_replace('/\.[\w-]+/', '', $selector);
        }

        // Remaining is tag
        if (!empty($selector)) {
            $tag = $selector;
        }

        $xpath = $tag;
        if (!empty($conditions)) {
            $xpath .= '[' . implode(' and ', $conditions) . ']';
        }
        if ($position !== null) {
            $xpath .= '[' . $position . ']';
        }

        return $xpath;
    }

    /**
     * Render a matched node according to the requested output option.
     *
     * @param DOMDocument $doc     Document that owns $node.
     * @param DOMNode     $node    Matched node.
     * @param string      $option  "Text", "Html", an attribute name, or '' for outer HTML.
     * @param string      $baseUrl Currently unused by this method.
     *
     * @return string Normalised text, outer HTML, or the attribute value ('' when the
     *                node is not an element or lacks the attribute).
     */
    private function formatOutput($doc, $node, $option, $baseUrl)
    {
        if ($option === 'Text') {
            return $this->parseText($node->textContent);
        }

        if ($option === 'Html') {
            return $doc->saveHTML($node);
        }

        if ($option) {
            // Attribute. Only elements carry attributes; a rule whose leftover "tag"
            // selects another node type would otherwise trigger a fatal error here.
            if (!($node instanceof DOMElement)) {
                return '';
            }

            // Handle style url() extraction if needed? JS does it.
            // JS: if (contains(opt, 'style') && contains(ret, 'url(')) ...
            return $node->getAttribute($option);
        }

        // Default to outer HTML if no option provided
        return $doc->saveHTML($node);
    }

    /**
     * Load an HTML string into a DOMDocument, treating it as UTF-8.
     *
     * Input that is not valid UTF-8 is converted first, with GBK and BIG5 as the
     * candidate source encodings. Parser warnings for malformed markup are collected
     * internally and discarded; the caller's libxml error mode is restored afterwards.
     *
     * @param string $html HTML source.
     *
     * @return DOMDocument Parsed document.
     */
    private function getDom($html)
    {
        $doc = new DOMDocument();

        // Suppress warnings for malformed HTML, remembering the caller's setting so
        // this helper does not change libxml error handling for the rest of the process.
        $previous = libxml_use_internal_errors(true);

        // Force UTF-8 encoding
        if (!empty($html) && mb_detect_encoding($html, 'UTF-8', true) === false) {
            $html = mb_convert_encoding($html, 'UTF-8', 'GBK, BIG5');
        }

        // Add meta charset to ensure DOMDocument treats it as UTF-8. Without an
        // explicit charset declaration loadHTML() does not assume UTF-8.
        $html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html;

        $doc->loadHTML($html);

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $doc;
    }

    /**
     * Resolve a (possibly relative) URL against a base URL.
     *
     * - A reference that already has a scheme ("https:", "magnet:", "data:", ...) is
     *   returned unchanged.
     * - "//host/path" takes the base scheme (http when the base has none).
     * - "/path" is resolved against the base scheme, host and port.
     * - "?query" and "#fragment" are appended to the base path.
     * - Anything else is resolved against the directory of the base path, i.e.
     *   everything up to and including its last "/".
     *
     * Dot segments ("./", "../") are not normalised.
     *
     * @param string $baseUrl     Absolute URL of the page the reference came from.
     * @param string $relativeUrl Reference to resolve.
     *
     * @return string Resolved URL; '' for an empty reference; the reference itself
     *                when the base is empty or has no host.
     */
    private function urlJoin($baseUrl, $relativeUrl)
    {
        if (empty($relativeUrl)) {
            return '';
        }

        // Already absolute: any "scheme:" prefix, matched case-insensitively.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $relativeUrl)) {
            return $relativeUrl;
        }

        if (empty($baseUrl)) {
            return $relativeUrl;
        }

        $parts = parse_url($baseUrl);
        if (!is_array($parts)) {
            // parse_url() returns false for seriously malformed URLs.
            $parts = [];
        }

        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        // Handle protocol-relative URLs (starting with //)
        if (substr($relativeUrl, 0, 2) === '//') {
            return $scheme . ':' . $relativeUrl;
        }

        // Without a host there is nothing meaningful to resolve against.
        if (empty($parts['host'])) {
            return $relativeUrl;
        }

        $origin = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

        $first = substr($relativeUrl, 0, 1);
        if ($first === '/') {
            return $origin . $relativeUrl;
        }
        if ($first === '?') {
            return $origin . $basePath . $relativeUrl;
        }
        if ($first === '#') {
            $query = isset($parts['query']) ? '?' . $parts['query'] : '';

            return $origin . $basePath . $query . $relativeUrl;
        }

        // Relative path: keep the base path up to and including its last "/", so that
        // a base of "/a/b/" resolves against "/a/b/" and "/a/b/c.html" against "/a/b/".
        $slash = strrpos($basePath, '/');
        $dir = ($slash === false) ? '/' : substr($basePath, 0, $slash + 1);

        return $origin . $dir . $relativeUrl;
    }
}