hjdhnx
diff --git a/‎spider/php/B站_DZ.php‎ ‎spider/php/B站 ᵈᶻ.php‎spider/php/B站_DZ.php renamed to spider/php/B站 ᵈᶻ.php
Lines changed: 1 addition & 1 deletion b/‎spider/php/B站_DZ.php‎ ‎spider/php/B站 ᵈᶻ.php‎spider/php/B站_DZ.php renamed to spider/php/B站 ᵈᶻ.php
Lines changed: 1 addition & 1 deletion
diff --git a/‎spider/php/PHP写源(道长).pdf‎
142 KB b/‎spider/php/PHP写源(道长).pdf‎
142 KB
diff --git a/‎spider/php/config.php‎
Lines changed: 9 additions & 1 deletion b/‎spider/php/config.php‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎spider/php/lib/HtmlParser.php‎
Lines changed: 203 additions & 0 deletions b/‎spider/php/lib/HtmlParser.php‎
Lines changed: 203 additions & 0 deletions
diff --git a/‎spider/php/spider.php‎ ‎spider/php/lib/spider.php‎spider/php/spider.php renamed to spider/php/lib/spider.php
Lines changed: 73 additions & 1 deletion b/‎spider/php/spider.php‎ ‎spider/php/lib/spider.php‎spider/php/spider.php renamed to spider/php/lib/spider.php
Lines changed: 73 additions & 1 deletion
@@ -4,7 +4,7 @@
  * 按照 BaseSpider 结构重写
  */
 
-require_once __DIR__ . '/spider.php';
+require_once __DIR__ . '/lib/spider.php';
 
 class Spider extends BaseSpider {
 
 
@@ -34,7 +34,15 @@
         continue;
     }
 
-    if (in_array($file, [$self, 'index.php', 'spider.php', 'example_t4.php', 'test_runner.php'])) {
+    // Skip files that must not be treated as spider sources:
+    // 1. System/framework files (index.php, spider.php, etc.)
+    // 2. The current file ($self)
+    // 3. Files whose name starts with "_" (e.g. _backup.php)
+    // 4. Files whose name starts with "config" (e.g. config_old.php)
+    if (in_array($file, ['index.php', 'spider.php', 'example_t4.php', 'test_runner.php']) ||
+        $file === $self ||
+        strpos($file, '_') === 0 ||
+        fnmatch('config*.php', $file)) {
         continue;
     }
 
 
@@ -0,0 +1,203 @@
+<?php
+
+/**
+ * Minimal HTML extractor driven by a jQuery-like rule syntax.
+ *
+ * A rule is a chain of simple selectors (`tag`, `.class`, `#id`, optionally
+ * suffixed with `:eq(n)`) separated by a space or `&&`. For pdfh()/pd() the
+ * last `&&` segment is an output option: `Text`, `Html` or an attribute name.
+ * Rules are translated to XPath 1.0 and evaluated with DOMXPath.
+ */
+class HtmlParser {
+
+    /**
+     * Select every node matching $rule and return each one's outer HTML.
+     *
+     * @param string $html Raw HTML document or fragment.
+     * @param string $rule Selector chain (no output option).
+     * @return string[] Outer HTML of each match in document order; [] when
+     *                  the input is empty or nothing matches.
+     */
+    public function pdfa($html, $rule) {
+        if (empty($html) || empty($rule)) return [];
+
+        $xpathQuery = $this->parseRuleToXpath($rule);
+        // A rule made only of separators yields no steps; querying it would
+        // just raise an "Invalid expression" warning.
+        if ($xpathQuery === '') return [];
+
+        $doc = $this->getDom($html);
+        $xpath = new DOMXPath($doc);
+        $nodes = $xpath->query($xpathQuery);
+
+        $res = [];
+        if ($nodes) {
+            foreach ($nodes as $node) {
+                // saveHTML($node) serialises the node itself, i.e. outer HTML.
+                $res[] = $doc->saveHTML($node);
+            }
+        }
+        return $res;
+    }
+
+    /**
+     * Extract a single value (text, HTML or attribute) from $html.
+     *
+     * @param string $html    Raw HTML document or fragment.
+     * @param string $rule    Selector chain, optionally ending in `&&Option`.
+     * @param string $baseUrl Passed through to the formatter (currently unused there).
+     * @return string Extracted value, or '' when the input is empty or nothing matches.
+     */
+    public function pdfh($html, $rule, $baseUrl = '') {
+        if (empty($html) || empty($rule)) return '';
+
+        // The segment after the last "&&" is the output option.
+        $option = '';
+        if (strpos($rule, '&&') !== false) {
+            $parts = explode('&&', $rule);
+            $option = array_pop($parts);
+            $rule = implode('&&', $parts);
+        }
+
+        $xpathQuery = $this->parseRuleToXpath($rule);
+        if ($xpathQuery === '') return '';
+
+        $doc = $this->getDom($html);
+        $xpath = new DOMXPath($doc);
+        $nodes = $xpath->query($xpathQuery);
+
+        if ($nodes && $nodes->length > 0) {
+            // Text is the only option that merges every matched node.
+            if ($option === 'Text') {
+                $text = '';
+                foreach ($nodes as $node) {
+                    $text .= $node->textContent;
+                }
+                return $this->parseText($text);
+            }
+
+            // Every other option reads from the first match only.
+            $node = $nodes->item(0);
+            return $this->formatOutput($doc, $node, $option, $baseUrl);
+        }
+        return '';
+    }
+
+    /**
+     * Same as pdfh(), but the result is resolved against $baseUrl as a URL.
+     *
+     * @param string $html    Raw HTML document or fragment.
+     * @param string $rule    Selector chain, normally ending in `&&href` / `&&src`.
+     * @param string $baseUrl Absolute URL used to resolve relative results.
+     * @return string Resolved URL, or '' when nothing was extracted.
+     */
+    public function pd($html, $rule, $baseUrl = '') {
+        $res = $this->pdfh($html, $rule, $baseUrl);
+        return $this->urlJoin($baseUrl, $res);
+    }
+
+    // --- Helper Methods ---
+
+    /**
+     * Collapse every whitespace run to a single space and trim both ends.
+     *
+     * Modelled on the JS reference:
+     *   text = text.replace(/[\s]+/gm, '\n');
+     *   text = text.replace(/\n+/g, '\n').replace(/^\s+/, '');
+     *   text = text.replace(/\n/g, ' ');
+     * Note that trim() here also strips the trailing end, which the JS
+     * reference leaves in place.
+     */
+    private function parseText($text) {
+        $text = preg_replace('/[\s]+/u', "\n", $text);
+        $text = preg_replace('/\n+/', "\n", $text);
+        $text = trim($text);
+        $text = str_replace("\n", ' ', $text);
+        return $text;
+    }
+
+    /**
+     * Translate a selector chain into one XPath expression.
+     *
+     * Segments are joined with the descendant axis. A segment carrying
+     * `:eq(n)` wraps everything built so far in parentheses before applying
+     * the index, e.g. `ul li:eq(1) a` becomes `(//ul//li)[2]//a`. Without the
+     * parentheses, `//ul//li[2]` would select every <li> that is the second
+     * <li> child of its own parent, not the second node of the matched set.
+     *
+     * @return string XPath expression, or '' when the rule has no segments.
+     */
+    private function parseRuleToXpath($rule) {
+        // "&&" and space are interchangeable descendant separators.
+        $rule = str_replace('&&', ' ', $rule);
+        $xpath = '';
+
+        foreach (explode(' ', $rule) as $part) {
+            if (empty($part)) continue;
+            $sel = $this->transSingleSelector($part);
+            $xpath .= '//' . $sel['step'];
+            if ($sel['position'] !== null) {
+                $xpath = '(' . $xpath . ')[' . $sel['position'] . ']';
+            }
+        }
+
+        return $xpath;
+    }
+
+    /**
+     * Convert one simple selector (`tag.class#id:eq(n)`) to an XPath step.
+     *
+     * @return array ['step' => XPath node test plus id/class predicates,
+     *                'position' => 1-based index, "last()", "last()-k",
+     *                              or null when no :eq() was given]
+     */
+    private function transSingleSelector($selector) {
+        // :eq(n) -> positional index (applied by the caller to the whole set).
+        $position = null;
+        if (preg_match('/:eq\((-?\d+)\)/', $selector, $matches)) {
+            $idx = intval($matches[1]);
+            $selector = str_replace($matches[0], '', $selector);
+            if ($idx >= 0) {
+                $position = $idx + 1; // XPath is 1-based
+            } else {
+                // -1 => last(), -2 => last()-1, ...
+                $offset = abs($idx) - 1;
+                $position = "last()" . ($offset > 0 ? "-$offset" : "");
+            }
+        }
+
+        $tag = '*';
+        $conditions = [];
+
+        // #id
+        if (preg_match('/#([\w-]+)/', $selector, $m)) {
+            $conditions[] = '@id="' . $m[1] . '"';
+            $selector = str_replace($m[0], '', $selector);
+        }
+
+        // .class (token match, so "item" does not match "item-wide")
+        if (preg_match_all('/\.([\w-]+)/', $selector, $m)) {
+            foreach ($m[1] as $cls) {
+                $conditions[] = 'contains(concat(" ", normalize-space(@class), " "), " ' . $cls . ' ")';
+            }
+            $selector = preg_replace('/\.[\w-]+/', '', $selector);
+        }
+
+        // Whatever is left is the tag name.
+        if ($selector !== '') {
+            $tag = $selector;
+        }
+
+        $step = $tag;
+        if (!empty($conditions)) {
+            $step .= '[' . implode(' and ', $conditions) . ']';
+        }
+
+        return ['step' => $step, 'position' => $position];
+    }
+
+    /**
+     * Render a matched node according to the output option.
+     *
+     * @param DOMDocument $doc     Owner document, used for serialisation.
+     * @param DOMNode     $node    Matched node.
+     * @param string      $option  'Text', 'Html', an attribute name, or ''.
+     * @param string      $baseUrl Unused here; kept for signature parity.
+     * @return string
+     */
+    private function formatOutput($doc, $node, $option, $baseUrl) {
+        if ($option === 'Text') {
+            return $this->parseText($node->textContent);
+        } elseif ($option === 'Html') {
+            return $doc->saveHTML($node);
+        } elseif ($option) {
+            // Attribute lookup; only element nodes carry attributes.
+            if (!($node instanceof DOMElement)) {
+                return '';
+            }
+            // NOTE: the JS reference additionally unwraps url(...) from
+            // style attributes; that is not implemented here.
+            return $node->getAttribute($option);
+        }
+        // No option: outer HTML.
+        return $doc->saveHTML($node);
+    }
+
+    /**
+     * Parse $html into a DOMDocument, tolerating malformed markup.
+     *
+     * libxml error collection is enabled only for the duration of the parse
+     * and then restored, so the caller's libxml configuration is untouched.
+     */
+    private function getDom($html) {
+        $doc = new DOMDocument();
+        // Collect (and discard) parse warnings instead of emitting them.
+        $previous = libxml_use_internal_errors(true);
+        // Input that is not valid UTF-8 is assumed to be GBK or BIG5.
+        if (!empty($html) && mb_detect_encoding($html, 'UTF-8', true) === false) {
+             $html = mb_convert_encoding($html, 'UTF-8', 'GBK, BIG5');
+        }
+        // The leading meta tag makes DOMDocument decode the input as UTF-8.
+        $html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html;
+
+        $doc->loadHTML($html);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+        return $doc;
+    }
+
+    /**
+     * Resolve $relativeUrl against $baseUrl.
+     *
+     * Handles absolute URLs of any scheme, protocol-relative (`//host/x`),
+     * root-relative (`/x`), query-only (`?a=1`) and path-relative references,
+     * and keeps the base URL's port. Dot segments (`./`, `../`) are not
+     * normalised.
+     *
+     * @return string '' for an empty reference; the reference unchanged when
+     *                it is already absolute or the base has no host.
+     */
+    private function urlJoin($baseUrl, $relativeUrl) {
+        if (empty($relativeUrl)) return '';
+        // Anything that already has a scheme (http:, magnet:, data:, ...) is absolute.
+        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $relativeUrl)) return $relativeUrl;
+
+        if (empty($baseUrl)) return $relativeUrl;
+
+        $parts = parse_url($baseUrl);
+        // Without a host there is nothing meaningful to resolve against.
+        if (!is_array($parts) || empty($parts['host'])) return $relativeUrl;
+
+        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
+        $origin = $scheme . '://' . $parts['host'];
+        if (isset($parts['port'])) {
+            $origin .= ':' . $parts['port'];
+        }
+
+        // Protocol-relative: inherit the scheme only.
+        if (strncmp($relativeUrl, '//', 2) === 0) {
+            return $scheme . ':' . $relativeUrl;
+        }
+
+        // Root-relative.
+        if ($relativeUrl[0] === '/') {
+            return $origin . $relativeUrl;
+        }
+
+        $path = isset($parts['path']) ? $parts['path'] : '/';
+
+        // Query-only reference keeps the base path.
+        if ($relativeUrl[0] === '?') {
+            return $origin . $path . $relativeUrl;
+        }
+
+        // Path-relative: replace everything after the last "/" of the base
+        // path. dirname() is avoided because it also strips the final
+        // directory of a path that ends in "/".
+        $slash = strrpos($path, '/');
+        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
+
+        return $origin . $dir . $relativeUrl;
+    }
+}
@@ -15,6 +15,8 @@
 error_reporting(E_ALL);
 ini_set('display_errors', '1');
 
+require_once __DIR__ . '/HtmlParser.php';
+
 abstract class BaseSpider {
 
     // 默认请求头
@@ -24,6 +26,15 @@ abstract class BaseSpider {
         'Accept-Language' => 'zh-CN,zh;q=0.9',
     ];
 
+    /**
+     * @var HtmlParser
+     */
+    protected $htmlParser;
+
+    /**
+     * Create the spider together with the HTML parser its helpers delegate to.
+     */
+    public function __construct() {
+        $parser = new HtmlParser();
+        $this->htmlParser = $parser;
+    }
+
     /**
      * 初始化方法
      * @param string $extend 扩展参数
@@ -113,6 +124,51 @@ public function action($action, $value) {
 
     // ================== 辅助方法 ==================
 
+    /**
+     * Delegate to HtmlParser::pdfa().
+     *
+     * @param string $html HTML source
+     * @param string $rule Selector rule
+     * @return mixed Whatever the parser returns for this rule
+     */
+    protected function pdfa($html, $rule) {
+        $matches = $this->htmlParser->pdfa($html, $rule);
+        return $matches;
+    }
+    
+    /**
+     * Delegate to HtmlParser::pdfh().
+     *
+     * @param string $html    HTML source
+     * @param string $rule    Selector rule
+     * @param string $baseUrl Forwarded to the parser unchanged
+     * @return mixed Whatever the parser returns for this rule
+     */
+    protected function pdfh($html, $rule, $baseUrl = '') {
+        $value = $this->htmlParser->pdfh($html, $rule, $baseUrl);
+        return $value;
+    }
+    
+    /**
+     * Delegate to HtmlParser::pd(); when no base URL is supplied, the value
+     * found by tryGetHost() is used instead.
+     *
+     * @param string $html    HTML source
+     * @param string $rule    Selector rule
+     * @param string $baseUrl Base URL; an empty value triggers the HOST lookup
+     * @return mixed Whatever the parser returns for this rule
+     */
+    protected function pd($html, $rule, $baseUrl = '') {
+        $effectiveBase = empty($baseUrl) ? $this->tryGetHost() : $baseUrl;
+        return $this->htmlParser->pd($html, $rule, $effectiveBase);
+    }
+
+    /**
+     * Best-effort lookup of the HOST declared on the concrete spider class.
+     *
+     * Checks a `HOST` property first (any visibility) and falls back to a
+     * `HOST` class constant when the property is missing or empty. This must
+     * never throw: it only supplies a default base URL for pd().
+     *
+     * @return mixed The HOST value, or '' when none could be read.
+     */
+    private function tryGetHost() {
+        try {
+            $ref = new ReflectionClass($this);
+
+            // 1. HOST property (preferred)
+            if ($ref->hasProperty('HOST')) {
+                $prop = $ref->getProperty('HOST');
+                // From PHP 8.1 reflection can read non-public properties
+                // directly; older versions need setAccessible().
+                if (PHP_VERSION_ID < 80100) {
+                    $prop->setAccessible(true);
+                }
+                $val = $prop->getValue($this);
+                if (!empty($val)) {
+                    return $val;
+                }
+            }
+
+            // 2. HOST class constant
+            if ($ref->hasConstant('HOST')) {
+                return $ref->getConstant('HOST');
+            }
+        } catch (Exception $e) {
+            // ignore: the lookup is optional
+        } catch (Throwable $e) {
+            // Reading a typed property before it is initialised throws Error,
+            // not Exception; it must not escape a best-effort helper.
+        }
+        return '';
+    }
+
     /**
      * 快速构建分页返回结果
      * @param array $list 视频列表
@@ -157,6 +213,12 @@ protected function pageResult($list, $pg, $total = 0, $limit = 20) {
      * @return string|bool
      */
     protected function fetch($url, $options = [], $headers = []) {
+        // 支持从 options 中传递 headers
+        if (isset($options['headers'])) {
+            $headers = array_merge($headers, $options['headers']);
+            unset($options['headers']);
+        }
+
         $ch = curl_init();
 
         // 1. 解析自定义 header 为关联数组
@@ -181,7 +243,12 @@ protected function fetch($url, $options = [], $headers = []) {
         // 3. 转换回 CURL 所需的索引数组
         $mergedHeaders = [];
         foreach ($finalHeadersMap as $k => $v) {
-            $mergedHeaders[] = "$k: $v";
+            if ($v === "") {
+                // To send empty header in CURL, use "Header;" (no colon)
+                $mergedHeaders[] = $k . ";";
+            } else {
+                $mergedHeaders[] = "$k: $v";
+            }
         }
 
         $defaultOptions = [
@@ -223,6 +290,11 @@ protected function fetch($url, $options = [], $headers = []) {
         return $result;
     }
 
+    /**
+     * Fetch $url and decode the response body as a JSON array/object.
+     *
+     * @param string $url     Request URL
+     * @param array  $options Options forwarded to fetch()
+     * @return array Decoded data as an associative array; [] when the request
+     *               failed, the body is empty or not valid JSON, or the JSON
+     *               is a scalar rather than an array/object.
+     */
+    protected function fetchJson($url, $options = []) {
+        $resp = $this->fetch($url, $options);
+        // fetch() is documented as returning string|bool; only a non-empty
+        // string is worth decoding.
+        if (!is_string($resp) || $resp === '') {
+            return [];
+        }
+
+        $data = json_decode($resp, true);
+        // Scalar JSON ("ok", 123, true) would otherwise reach callers that
+        // index the result as an array.
+        return is_array($data) ? $data : [];
+    }
+
     /**
      * 自动运行，处理路由
      */