Skip to content

Commit f9011da

Browse files
author
Taois
committed
feat: 完善php标准
1 parent 9c9ad92 commit f9011da

23 files changed

+1382
-73
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* 按照 BaseSpider 结构重写
55
*/
66

7-
require_once __DIR__ . '/spider.php';
7+
require_once __DIR__ . '/lib/spider.php';
88

99
class Spider extends BaseSpider {
1010

spider/php/PHP写源(道长).pdf

142 KB
Binary file not shown.

spider/php/config.php

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,15 @@
3434
continue;
3535
}
3636

37-
if (in_array($file, [$self, 'index.php', 'spider.php', 'example_t4.php', 'test_runner.php'])) {
37+
// 排除特定文件:
38+
// 1. 系统/框架文件 (index.php, spider.php 等)
39+
// 2. 当前文件 ($self)
40+
// 3. 以 _ 开头的文件 (如 _backup.php)
41+
// 4. config 开头的文件 (如 config_old.php)
42+
if (in_array($file, ['index.php', 'spider.php', 'example_t4.php', 'test_runner.php']) ||
43+
$file === $self ||
44+
strpos($file, '_') === 0 ||
45+
fnmatch('config*.php', $file)) {
3846
continue;
3947
}
4048

spider/php/lib/HtmlParser.php

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
<?php
2+
3+
class HtmlParser {

    /**
     * Select every element matching $rule and return each one's outer HTML.
     *
     * @param string $html Raw HTML document or fragment.
     * @param string $rule Selector rule: simple selectors (tag, .class, #id, :eq(n))
     *                     separated by spaces or "&&".
     * @return array List of outer-HTML strings; empty when nothing matches.
     */
    public function pdfa($html, $rule) {
        if (empty($html) || empty($rule)) return [];

        $doc = $this->getDom($html);
        $nodes = $this->selectNodes($doc, $rule);

        $res = [];
        if ($nodes !== null) {
            foreach ($nodes as $node) {
                // saveHTML($node) serialises the node itself, i.e. its outer HTML
                $res[] = $doc->saveHTML($node);
            }
        }
        return $res;
    }

    /**
     * Select elements matching $rule and return a single value.
     *
     * When the rule contains "&&", its last "&&"-separated segment is the
     * output option: "Text" (normalised text of all matches, concatenated),
     * "Html" (outer HTML of the first match) or an attribute name (read from
     * the first match). Without an option the first match's outer HTML is
     * returned.
     *
     * @param string $html    Raw HTML document or fragment.
     * @param string $rule    Selector rule, optionally ending in "&&<option>".
     * @param string $baseUrl Forwarded to formatOutput(); not used to rewrite the value here.
     * @return string The extracted value, or '' when nothing matches.
     */
    public function pdfh($html, $rule, $baseUrl = '') {
        if (empty($html) || empty($rule)) return '';

        // Separate the trailing option from the selector part
        $option = '';
        if (strpos($rule, '&&') !== false) {
            $parts = explode('&&', $rule);
            $option = array_pop($parts);
            $rule = implode('&&', $parts);
        }

        $doc = $this->getDom($html);
        $nodes = $this->selectNodes($doc, $rule);
        if ($nodes === null || $nodes->length === 0) {
            return '';
        }

        // Special handling for the Text option: concatenate all matched nodes
        if ($option === 'Text') {
            $text = '';
            foreach ($nodes as $node) {
                $text .= $node->textContent;
            }
            return $this->parseText($text);
        }

        // Every other option works on the first matched node only
        return $this->formatOutput($doc, $nodes->item(0), $option, $baseUrl);
    }

    /**
     * Like pdfh(), but resolves the extracted value against $baseUrl.
     *
     * @param string $html    Raw HTML document or fragment.
     * @param string $rule    Selector rule, normally ending in "&&<attribute>".
     * @param string $baseUrl URL the extracted reference is resolved against.
     * @return string Resolved URL, or '' when nothing was extracted.
     */
    public function pd($html, $rule, $baseUrl = '') {
        $res = $this->pdfh($html, $rule, $baseUrl);
        return $this->urlJoin($baseUrl, $res);
    }

    // --- Helper Methods ---

    /**
     * Run the selector rule against the document.
     *
     * @param DOMDocument $doc  Parsed document.
     * @param string      $rule Selector rule without any output option.
     * @return DOMNodeList|null Matched nodes, or null when the rule is empty
     *                          or the XPath query could not be evaluated.
     */
    private function selectNodes($doc, $rule) {
        $query = $this->parseRuleToXpath($rule);
        if ($query === '') {
            // A rule with no selector parts would otherwise become the invalid query "//"
            return null;
        }
        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query($query);
        return $nodes === false ? null : $nodes;
    }

    /**
     * Collapse every run of whitespace into a single space and trim both ends.
     *
     * @param string $text Raw text content.
     * @return string Normalised text.
     */
    private function parseText($text) {
        return trim((string) preg_replace('/\s+/u', ' ', $text));
    }

    /**
     * Translate a selector rule into an XPath expression.
     *
     * Parts are joined with the descendant axis. A part carrying :eq(n)
     * indexes into the whole node-set matched so far, so the accumulated
     * path is wrapped in parentheses before the positional predicate is
     * applied: "ul li:eq(1) a" becomes "(//ul//li)[2]//a". A bare "//li[2]"
     * would instead mean "every li that is the second li child of its parent".
     *
     * @param string $rule Selector parts separated by spaces or "&&".
     * @return string XPath expression, or '' when the rule has no parts.
     */
    private function parseRuleToXpath($rule) {
        // Replace && with space to unify as descendant separator
        $rule = str_replace('&&', ' ', $rule);

        $xpath = '';
        foreach (explode(' ', $rule) as $part) {
            if (empty($part)) continue;

            list($step, $position) = $this->transSingleSelector($part);
            $xpath .= '//' . $step;
            if ($position !== null) {
                $xpath = '(' . $xpath . ')[' . $position . ']';
            }
        }
        return $xpath;
    }

    /**
     * Translate one simple selector (tag.class#id:eq(n)) into an XPath step.
     *
     * @param string $selector A single selector part without whitespace.
     * @return array Two elements: the XPath step (node test plus attribute
     *               predicates) and the positional predicate content
     *               (int, "last()", "last()-k") or null when :eq is absent.
     */
    private function transSingleSelector($selector) {
        // Handle :eq
        $position = null;
        if (preg_match('/:eq\((-?\d+)\)/', $selector, $matches)) {
            $idx = intval($matches[1]);
            $selector = str_replace($matches[0], '', $selector);
            if ($idx >= 0) {
                $position = $idx + 1; // XPath is 1-based
            } else {
                // -1 is last(), -2 is last()-1, ...
                $offset = abs($idx) - 1;
                $position = "last()" . ($offset > 0 ? "-$offset" : "");
            }
        }

        // Handle tag.class#id
        $tag = '*';
        $conditions = [];

        // Extract id
        if (preg_match('/#([\w-]+)/', $selector, $m)) {
            $conditions[] = '@id="' . $m[1] . '"';
            $selector = str_replace($m[0], '', $selector);
        }

        // Extract classes; each one must appear as a whole token in @class
        if (preg_match_all('/\.([\w-]+)/', $selector, $m)) {
            foreach ($m[1] as $cls) {
                $conditions[] = 'contains(concat(" ", normalize-space(@class), " "), " ' . $cls . ' ")';
            }
            $selector = preg_replace('/\.[\w-]+/', '', $selector);
        }

        // Whatever remains is the tag name
        if (!empty($selector)) {
            $tag = $selector;
        }

        $step = $tag;
        if (!empty($conditions)) {
            $step .= '[' . implode(' and ', $conditions) . ']';
        }

        return [$step, $position];
    }

    /**
     * Render one matched node according to the requested option.
     *
     * @param DOMDocument $doc     Owning document (used for serialisation).
     * @param DOMNode     $node    The matched node.
     * @param string      $option  "Text", "Html", an attribute name, or ''.
     * @param string      $baseUrl Currently unused.
     * @return string
     */
    private function formatOutput($doc, $node, $option, $baseUrl) {
        if ($option === 'Text') {
            return $this->parseText($node->textContent);
        } elseif ($option === 'Html') {
            return $doc->saveHTML($node);
        } elseif ($option) {
            // Attribute
            // NOTE(review): the JS counterpart reportedly extracts url(...) from
            // style attributes; that is not implemented here.
            return $node->getAttribute($option);
        }
        // Default to outer HTML if no option provided
        return $doc->saveHTML($node);
    }

    /**
     * Parse an HTML string into a DOMDocument, forcing UTF-8 handling.
     *
     * Side effect: libxml internal error collection is switched on and left on.
     *
     * @param string $html Raw HTML.
     * @return DOMDocument
     */
    private function getDom($html) {
        $doc = new DOMDocument();
        // Suppress warnings for malformed HTML
        libxml_use_internal_errors(true);
        // Input that is not valid UTF-8 is converted, guessing between GBK and BIG5
        if (!empty($html) && mb_detect_encoding($html, 'UTF-8', true) === false) {
            $html = mb_convert_encoding($html, 'UTF-8', 'GBK, BIG5');
        }
        // Add meta charset to ensure DOMDocument treats it as UTF-8
        $html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html;

        $doc->loadHTML($html);
        libxml_clear_errors();
        return $doc;
    }

    /**
     * Resolve a (possibly relative) URL reference against a base URL.
     *
     * Handles absolute URLs of any scheme, protocol-relative references
     * ("//host/path"), root-relative paths, query-only references and
     * paths relative to the base URL's directory. The base URL's port is
     * preserved. Dot segments ("./", "../") are not collapsed.
     *
     * @param string $baseUrl     Base URL; may be empty.
     * @param string $relativeUrl Reference to resolve.
     * @return string Resolved URL; '' for an empty reference; the reference
     *                itself when there is no usable base.
     */
    private function urlJoin($baseUrl, $relativeUrl) {
        if (empty($relativeUrl)) return '';
        // Anything that already carries a scheme (http:, https:, magnet:, data:, ...) is absolute
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $relativeUrl)) return $relativeUrl;

        if (empty($baseUrl)) return $relativeUrl;

        $parts = parse_url($baseUrl);
        if ($parts === false) return $relativeUrl; // seriously malformed base URL

        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        // Protocol-relative reference: only the scheme is inherited
        if (strncmp($relativeUrl, '//', 2) === 0) {
            return $scheme . ':' . $relativeUrl;
        }

        $authority = isset($parts['host']) ? $parts['host'] : '';
        if (isset($parts['port'])) {
            $authority .= ':' . $parts['port'];
        }
        $origin = $scheme . '://' . $authority;

        // Root-relative path
        if ($relativeUrl[0] === '/') {
            return $origin . $relativeUrl;
        }

        $path = isset($parts['path']) ? $parts['path'] : '/';

        // Query-only reference keeps the base path and replaces the query
        if ($relativeUrl[0] === '?') {
            return $origin . $path . $relativeUrl;
        }

        // Relative path: resolve against everything up to and including the
        // last "/" of the base path, so "/list/" stays "/list/"
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return $origin . $dir . $relativeUrl;
    }
}
Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
error_reporting(E_ALL);
1616
ini_set('display_errors', '1');
1717

18+
require_once __DIR__ . '/HtmlParser.php';
19+
1820
abstract class BaseSpider {
1921

2022
// 默认请求头
@@ -24,6 +26,15 @@ abstract class BaseSpider {
2426
'Accept-Language' => 'zh-CN,zh;q=0.9',
2527
];
2628

29+
/**
30+
* @var HtmlParser
31+
*/
32+
protected $htmlParser;
33+
34+
/**
 * Create the HtmlParser instance that the pdfa/pdfh/pd helpers delegate to.
 */
public function __construct() {
    $parser = new HtmlParser();
    $this->htmlParser = $parser;
}
37+
2738
/**
2839
* 初始化方法
2940
* @param string $extend 扩展参数
@@ -113,6 +124,51 @@ public function action($action, $value) {
113124

114125
// ================== 辅助方法 ==================
115126

127+
/**
 * Delegate to HtmlParser::pdfa().
 *
 * @param string $html Raw HTML.
 * @param string $rule Selector rule.
 * @return mixed Whatever the parser's pdfa() returns.
 */
protected function pdfa($html, $rule) {
    $parser = $this->htmlParser;
    $matches = $parser->pdfa($html, $rule);
    return $matches;
}
130+
131+
/**
 * Delegate to HtmlParser::pdfh().
 *
 * @param string $html    Raw HTML.
 * @param string $rule    Selector rule.
 * @param string $baseUrl Passed through to the parser unchanged.
 * @return mixed Whatever the parser's pdfh() returns.
 */
protected function pdfh($html, $rule, $baseUrl = '') {
    $parser = $this->htmlParser;
    $value = $parser->pdfh($html, $rule, $baseUrl);
    return $value;
}
134+
135+
/**
 * Delegate to HtmlParser::pd(), falling back to the value from
 * tryGetHost() when the caller supplies no base URL.
 *
 * @param string $html    Raw HTML.
 * @param string $rule    Selector rule.
 * @param string $baseUrl Base URL; when empty, tryGetHost() is used instead.
 * @return mixed Whatever the parser's pd() returns.
 */
protected function pd($html, $rule, $baseUrl = '') {
    $effectiveBase = empty($baseUrl) ? $this->tryGetHost() : $baseUrl;
    return $this->htmlParser->pd($html, $rule, $effectiveBase);
}
141+
142+
/**
 * Best-effort lookup of a HOST value declared on the concrete spider class.
 *
 * A property named HOST is tried first and used when its value is
 * non-empty; otherwise a class constant named HOST is returned if one
 * exists. Any failure during the lookup is swallowed and '' is returned,
 * so a missing or unreadable HOST never aborts the caller.
 *
 * @return mixed The HOST property/constant value, or '' when none is available.
 */
private function tryGetHost() {
    try {
        $ref = new ReflectionClass($this);

        // 1. Prefer a HOST property
        if ($ref->hasProperty('HOST')) {
            $prop = $ref->getProperty('HOST');
            // PHP 8.1+ can read non-public properties through reflection
            // without this call; older versions need it enabled explicitly.
            if (PHP_VERSION_ID < 80100) {
                $prop->setAccessible(true);
            }
            $val = $prop->getValue($this);
            if (!empty($val)) {
                return $val;
            }
        }

        // 2. Fall back to a HOST class constant
        if ($ref->hasConstant('HOST')) {
            return $ref->getConstant('HOST');
        }
    } catch (Throwable $e) {
        // Deliberately ignored. Throwable rather than Exception because
        // reading an uninitialised typed property raises an Error, which
        // would otherwise escape this best-effort lookup.
    }
    return '';
}
171+
116172
/**
117173
* 快速构建分页返回结果
118174
* @param array $list 视频列表
@@ -157,6 +213,12 @@ protected function pageResult($list, $pg, $total = 0, $limit = 20) {
157213
* @return string|bool
158214
*/
159215
protected function fetch($url, $options = [], $headers = []) {
216+
// 支持从 options 中传递 headers
217+
if (isset($options['headers'])) {
218+
$headers = array_merge($headers, $options['headers']);
219+
unset($options['headers']);
220+
}
221+
160222
$ch = curl_init();
161223

162224
// 1. 解析自定义 header 为关联数组
@@ -181,7 +243,12 @@ protected function fetch($url, $options = [], $headers = []) {
181243
// 3. 转换回 CURL 所需的索引数组
182244
$mergedHeaders = [];
183245
foreach ($finalHeadersMap as $k => $v) {
184-
$mergedHeaders[] = "$k: $v";
246+
if ($v === "") {
247+
// To send empty header in CURL, use "Header;" (no colon)
248+
$mergedHeaders[] = $k . ";";
249+
} else {
250+
$mergedHeaders[] = "$k: $v";
251+
}
185252
}
186253

187254
$defaultOptions = [
@@ -223,6 +290,11 @@ protected function fetch($url, $options = [], $headers = []) {
223290
return $result;
224291
}
225292

293+
/**
 * Fetch a URL and decode the response body as JSON into an array.
 *
 * @param string $url     Request URL, passed to fetch().
 * @param array  $options Request options, passed to fetch().
 * @return array The decoded JSON when it is an object or array; [] when the
 *               request yields no string body, the body is not valid JSON,
 *               or the JSON is a scalar/null.
 */
protected function fetchJson($url, $options = []) {
    $resp = $this->fetch($url, $options);
    if (!is_string($resp) || $resp === '') {
        // fetch() produced no body to decode
        return [];
    }

    $data = json_decode($resp, true);
    // Scalar JSON ("x", 5, true) is valid but breaks the array contract callers rely on
    return is_array($data) ? $data : [];
}
297+
226298
/**
227299
* 自动运行,处理路由
228300
*/

0 commit comments

Comments
 (0)