-
Notifications
You must be signed in to change notification settings - Fork 291
Expand file tree
/
Copy pathspiderTools.js
More file actions
406 lines (371 loc) · 12.8 KB
/
spiderTools.js
File metadata and controls
406 lines (371 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
import fs from "fs-extra";
import { resolvePath, isSafePath } from "../utils/pathHelper.js";
import { decodeDsSource } from "../utils/dsHelper.js";
import { exec } from "child_process";
import util from "util";
import path from "path";
import vm from "vm";
// Promise-returning wrapper around child_process.exec.
const execPromise = util.promisify(exec);

// Import project utils.
// Both helpers are optional: the tools that need them check for their presence
// and report a readable error when they are missing. Each module is loaded in
// its own try block so that a failure to load one does not prevent the other
// from being loaded.
let jsoup;
let req;
try {
  ({ jsoup } = await import("../../libs_drpy/htmlParser.js"));
} catch (e) {
  console.error("Warning: Failed to import project utils in spiderTools:", e.message);
}
try {
  ({ default: req } = await import("../../utils/req.js"));
} catch (e) {
  console.error("Warning: Failed to import project utils in spiderTools:", e.message);
}
/**
 * Lists the JavaScript spider files found in the project's source folders.
 *
 * The `spider/js` and `spider/catvod` directories are located through
 * `resolvePath`; only entries whose names end in `.js` are reported. A
 * directory that does not exist yields an empty list.
 *
 * @returns {Promise<{content: Array<{type: string, text: string}>}>} One text
 *   item containing pretty-printed JSON keyed by directory name.
 */
export const list_sources = async () => {
  const directories = ["spider/js", "spider/catvod"];
  // Resolve every directory up front, then scan them one after another.
  const resolved = directories.map((dir) => [dir, resolvePath(dir)]);

  const listing = {};
  for (const [dir, absoluteDir] of resolved) {
    let scripts = [];
    if (await fs.pathExists(absoluteDir)) {
      const entries = await fs.readdir(absoluteDir);
      scripts = entries.filter((name) => name.endsWith('.js'));
    }
    listing[dir] = scripts;
  }

  const text = JSON.stringify(listing, null, 2);
  return { content: [{ type: "text", text }] };
};
/**
 * Reports which controllers are registered in `controllers/index.js`.
 *
 * The file is read as text and every line that, once trimmed, starts with
 * `fastify.register(` is collected (in trimmed form).
 *
 * @returns {Promise<{content: Array<{type: string, text: string}>}>} One text
 *   item: either a "not found" message or pretty-printed JSON with the file
 *   name and the matching lines.
 */
export const get_routes_info = async () => {
  const controllerFile = resolvePath("controllers/index.js");

  const exists = await fs.pathExists(controllerFile);
  if (!exists) {
    return { content: [{ type: "text", text: "controllers/index.js not found" }] };
  }

  const source = await fs.readFile(controllerFile, "utf-8");
  const registered = [];
  for (const rawLine of source.split('\n')) {
    const line = rawLine.trim();
    if (line.startsWith('fastify.register(')) {
      registered.push(line);
    }
  }

  const summary = {
    file: "controllers/index.js",
    registered_controllers: registered,
  };
  return { content: [{ type: "text", text: JSON.stringify(summary, null, 2) }] };
};
/**
 * Fetches a URL with the project's `req` helper and returns the response
 * status, status text, headers and data as pretty-printed JSON.
 *
 * @param {object} args - Tool arguments.
 * @param {string} args.url - URL to request.
 * @param {object} [args.options] - Request configuration passed to `req`;
 *   `method` defaults to 'GET' when not supplied. The object is copied, never
 *   modified.
 * @returns {Promise<{isError?: boolean, content: Array<{type: string, text: string}>}>}
 *   The serialized response, or an error item when `req` is unavailable, the
 *   URL is missing, or the request fails.
 */
export const fetch_spider_url = async (args) => {
  if (!req) return { isError: true, content: [{ type: "text", text: "req module not loaded" }] };

  // Tolerate a missing argument object instead of throwing on destructuring.
  const { url, options } = args ?? {};
  if (!url) {
    return { isError: true, content: [{ type: "text", text: "Please provide a 'url' to fetch." }] };
  }

  try {
    // Work on a shallow copy so the caller's options object is left untouched
    // (it may be shared or frozen).
    const config = { ...options };
    if (!config.method) config.method = 'GET';

    const res = await req(url, config);
    const result = {
      status: res.status,
      statusText: res.statusText,
      headers: res.headers,
      data: res.data
    };
    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (e) {
    return {
      isError: true,
      content: [{ type: "text", text: `Fetch Error: ${e.message}\nResponse: ${e.response ? JSON.stringify(e.response.data) : 'No response'}` }]
    };
  }
};
/**
 * Runs a single parsing rule against HTML using the project's `jsoup` helper
 * and reports what it extracted.
 *
 * The HTML is taken from `args.html`; when that is absent and `args.url` is
 * given, the page is fetched with `req` first (non-string response data is
 * JSON-serialized).
 *
 * @param {object} args - Tool arguments.
 * @param {string} [args.html] - HTML content to parse.
 * @param {string} [args.url] - URL to fetch when no HTML is supplied.
 * @param {string} args.rule - Parsing rule handed to the jsoup method.
 * @param {'pdfa'|'pdfh'|'pd'} args.mode - Which jsoup method to call.
 * @param {string} [args.baseUrl] - Base URL given to the jsoup constructor;
 *   falls back to `args.url`, then to an empty string.
 * @param {object} [args.options] - Request options used when fetching `url`.
 * @returns {Promise<{isError?: boolean, content: Array<{type: string, text: string}>}>}
 *   Pretty-printed JSON with `mode`, `rule`, `count` and `result`, or an error
 *   item describing what went wrong.
 */
export const debug_spider_rule = async (args) => {
  if (!jsoup) return { isError: true, content: [{ type: "text", text: "jsoup module not loaded" }] };

  // Tolerate a missing argument object instead of throwing on destructuring.
  const { html, url, rule, mode, baseUrl, options } = args ?? {};

  // Reject unknown modes up front; otherwise the call would "succeed" with an
  // undefined result, which is indistinguishable from a rule that matched nothing.
  const supportedModes = ['pdfa', 'pdfh', 'pd'];
  if (!supportedModes.includes(mode)) {
    return {
      isError: true,
      content: [{ type: "text", text: `Unsupported mode '${mode}'. Expected one of: ${supportedModes.join(', ')}.` }]
    };
  }

  let content = html;
  const finalUrl = baseUrl || url;

  if (url && !content) {
    if (!req) return { isError: true, content: [{ type: "text", text: "req module not loaded for url fetch" }] };
    try {
      const res = await req(url, options || {});
      content = typeof res.data === 'string' ? res.data : JSON.stringify(res.data);
    } catch (e) {
      return {
        isError: true,
        content: [{ type: "text", text: `Failed to fetch URL: ${e.message}` }]
      };
    }
  }

  if (!content) {
    return {
      isError: true,
      content: [{ type: "text", text: "Please provide 'html' content or 'url' to fetch." }]
    };
  }

  try {
    const j = new jsoup(finalUrl || '');
    // `mode` was validated above, so it names one of the jsoup parse methods.
    const result = j[mode](content, rule);
    return {
      content: [{
        type: "text",
        text: JSON.stringify({
          mode,
          rule,
          // Arrays report their length; any other truthy result counts as one match.
          count: Array.isArray(result) ? result.length : (result ? 1 : 0),
          result
        }, null, 2)
      }]
    };
  } catch (e) {
    return {
      isError: true,
      content: [{ type: "text", text: `Parsing Error: ${e.message}` }]
    };
  }
};
/**
 * Produces a starter spider source file as text.
 *
 * The template declares a `rule` object with example values and a
 * commented-out section showing function-based alternatives. Its header
 * carries the current date (the `YYYY-MM-DD` part of the ISO timestamp).
 *
 * @returns {Promise<{content: Array<{type: string, text: string}>}>} One text
 *   item containing the template source.
 */
export const get_spider_template = async () => {
  // Date portion of the ISO-8601 timestamp, i.e. everything before the 'T'.
  const [today] = new Date().toISOString().split('T');

  // NOTE: everything between the backticks is emitted verbatim.
  const template = `/*
* @File : drpy-node spider template
* @Author : user
* @Date : ${today}
* @Comments :
*/
var rule = {
// 影视|漫画|小说
类型: '影视',
// 源标题
title: 'Site Name',
// 源主域名,可以自动处理后续链接的相对路径
host: 'https://example.com',
// 源主页链接,作为推荐的this.input
homeUrl: '/latest/',
// 源一级列表链接 (fyclass=分类, fypage=页码)
url: '/category/fyclass/page/fypage',
// 源搜索链接 (**=关键词, fypage=页码)
searchUrl: '/search?wd=**&pg=fypage',
// 允许搜索(1)、允许快搜(1)、允许筛选(1)
searchable: 2,
quickSearch: 0,
filterable: 1,
// 源默认请求头、调用await request如果参数二不填会自动添加
headers: {
'User-Agent': 'MOBILE_UA',
},
// 接口访问超时时间
timeout: 5000,
// 静态分类名称
class_name: 'Movie&TV&Anime',
// 静态分类id
class_url: '1&2&3',
// 动态分类获取 列表;标题;链接;正则提取 (可选)
// class_parse: '#side-menu:lt(1) li;a&&Text;a&&href;com/(.*?)/',
// 是否需要调用免嗅lazy函数 (服务器解析播放)
play_parse: true,
// 免嗅lazy执行函数 (如果play_parse为true则需要)
lazy: '',
// 首页推荐显示数量
limit: 6,
// 是否双层列表定位,默认false
double: true,
// 推荐列表解析: 列表;标题;图片;描述;链接
推荐: '.recommend .item;a&&title;img&&src;.remarks&&Text;a&&href',
// 一级列表解析: 列表;标题;图片;描述;链接
一级: '.list .item;a&&title;img&&src;.remarks&&Text;a&&href',
// 二级详情解析 (字典模式)
二级: {
"title": "h1&&Text",
"img": ".poster img&&src",
"desc": ".desc&&Text",
"content": ".content&&Text",
"tabs": ".tabs span", // 线路列表
"lists": ".playlists ul", // 选集列表
},
// 搜索结果解析: 列表;标题;图片;描述;链接
搜索: '.search-result .item;a&&title;img&&src;.remarks&&Text;a&&href',
/**
* 高级函数用法 (如需使用,请解除注释并替换相应字段)
* Advanced Function Usage (Uncomment and replace fields if needed)
*/
/*
// 动态获取域名 (优先级最高)
hostJs: async function () {
let {HOST} = this;
// ... perform logic ...
return HOST;
},
// 预处理 (初始化时执行一次,用于获取cookie等)
预处理: async function () {
let {HOST} = this;
// ... perform logic ...
return HOST;
},
// 自定义免嗅函数 (play_parse: true 时调用)
lazy: async function () {
let {input} = this;
// ... perform logic to get real url ...
return {
url: input,
parse: 0, // 0: 直接播放, 1: 嗅探
header: {} // 可选
};
},
// 动态分类解析 (替代 class_name/class_url)
class_parse: async function () {
let {input} = this;
// ... parse input ...
return {
class: [{type_name: '电影', type_id: '1'}],
filters: {} // 可选
};
},
// 自定义推荐列表解析 (替代字符串规则)
推荐: async function () {
let {input} = this;
// ... parse input ...
return [{
vod_name: 'Title',
vod_pic: 'Image',
vod_remarks: 'Desc',
vod_id: 'Url'
}];
},
// 自定义一级列表解析
一级: async function () {
let {input} = this;
// ... parse input ...
return [{
vod_name: 'Title',
vod_pic: 'Image',
vod_remarks: 'Desc',
vod_id: 'Url'
}];
},
// 自定义二级详情解析
二级: async function () {
let {input} = this;
// ... parse input ...
return {
vod_name: 'Title',
vod_pic: 'Image',
type_name: 'Category',
vod_year: 'Year',
vod_area: 'Area',
vod_actors: 'Actors',
vod_director: 'Director',
vod_content: 'Content',
vod_play_from: 'Line1$$$Line2', // 线路名
vod_play_url: 'Ep1$Url1#Ep2$Url2$$$Ep1$Url1...', // 播放列表
};
},
// 自定义搜索解析
搜索: async function () {
let {input} = this;
// ... parse input ...
return [{
vod_name: 'Title',
vod_pic: 'Image',
vod_remarks: 'Desc',
vod_id: 'Url'
}];
},
*/
}
`;

  const textItem = { type: "text", text: template };
  return { content: [textItem] };
};
/**
 * Returns a static reference sheet describing the globals available to drpy
 * spiders and the syntax of parsing rules.
 *
 * @returns {Promise<{content: Array<{type: string, text: string}>}>} One text
 *   item containing pretty-printed JSON with the sections "Global Objects"
 *   and "Parsing Rules".
 */
export const get_drpy_libs_info = async () => {
  // One entry per global, formatted as "signature - description".
  const globalObjects = [
    "request(url, options) / req(url, options) - HTTP Request",
    "post(url, options) - HTTP POST",
    "pdfa(html, rule) - Parse List (Cheerio)",
    "pdfh(html, rule) - Parse Html (Cheerio)",
    "pd(html, rule) - Parse Url (Cheerio + urljoin)",
    "log(msg) / print(msg) - Logging",
    "setItem(key, value) / getItem(key) - Storage",
    "urljoin(base, path) - URL Joining",
    "local - Local storage object",
    "input - Current input (url or content)",
    "HOST - Current source host",
    "rule - Current rule object",
  ];

  // Short notes on the rule mini-language.
  const parsingRules = [
    "Format: selector;attr1;attr2...",
    "pdfa (list): Returns array. Example: '.list li;a&&title;a&&href'",
    "pdfh (single): Returns string. Example: 'h1&&Text'",
    "pd (url): Returns resolved URL.",
    "Special syntax: && (separator), || (backup), * (all), :eq(n) (index)",
    "Attributes: Text, Html, href (auto-resolves), src, data-*, etc.",
  ];

  const text = JSON.stringify(
    { "Global Objects": globalObjects, "Parsing Rules": parsingRules },
    null,
    2,
  );
  return { content: [{ type: "text", text }] };
};
/**
 * Loads a spider file, executes it in an isolated `vm` context and checks that
 * it defines a global `rule` object with the mandatory fields.
 *
 * @param {object} args - Tool arguments.
 * @param {string} args.path - Path of the spider file; must be accepted by
 *   `isSafePath`. Files ending in `.js` are passed through `decodeDsSource`
 *   before execution.
 * @returns {Promise<{isError?: boolean, content: Array<{type: string, text: string}>}>}
 *   A success message, or an error item when the path is rejected, the file
 *   cannot be read or executed, execution exceeds the time limit, `rule` is
 *   missing, or `title` / `host` / `url` are missing from it.
 */
export const validate_spider = async (args) => {
  const filePath = args?.path;
  if (!filePath || !isSafePath(filePath)) {
    return { isError: true, content: [{ type: "text", text: "Invalid path" }] };
  }
  try {
    let code = await fs.readFile(resolvePath(filePath), 'utf-8');
    if (filePath.endsWith('.js')) {
      code = await decodeDsSource(code);
    }

    const noop = () => {};
    const sandbox = {
      // Mock console: cover the common methods so that a spider which logs at
      // load time is not rejected just because the method is absent.
      console: { log: noop, info: noop, warn: noop, error: noop, debug: noop },
      require: noop, // Disable require
    };
    vm.createContext(sandbox);

    // SECURITY NOTE: the file's code is executed here. The Node.js `vm` module
    // is not a security boundary, so only validate files you trust.
    // The timeout bounds synchronous execution so that a runaway top-level
    // loop in the spider cannot block the process indefinitely.
    new vm.Script(code).runInContext(sandbox, { timeout: 1000 });

    // A top-level `var rule = ...` in the script becomes a property of the
    // context object.
    if (!sandbox.rule) {
      return { isError: true, content: [{ type: "text", text: "Missing 'rule' object in spider file." }] };
    }

    // Basic validation of rule object
    const required = ['title', 'host', 'url'];
    const missing = required.filter(k => !sandbox.rule[k]);
    if (missing.length > 0) {
      return { isError: true, content: [{ type: "text", text: `Missing required fields in 'rule': ${missing.join(', ')}` }] };
    }
    return { content: [{ type: "text", text: "Spider structure is valid." }] };
  } catch (e) {
    return { isError: true, content: [{ type: "text", text: `Validation Error: ${e.message}` }] };
  }
};
/**
 * Checks whether a source file compiles as a script, without running it.
 *
 * @param {object} args - Tool arguments.
 * @param {string} args.path - Path of the file to check; must be accepted by
 *   `isSafePath`. Files ending in `.js` are passed through `decodeDsSource`
 *   before compilation.
 * @returns {Promise<{isError?: boolean, content: Array<{type: string, text: string}>}>}
 *   "Syntax OK" on success; otherwise an error item carrying the error message
 *   and stack (also used when the file cannot be read or decoded).
 */
export const check_syntax = async (args) => {
  // Small local builders for the two response shapes.
  const success = (text) => ({ content: [{ type: "text", text }] });
  const failure = (text) => ({ isError: true, content: [{ type: "text", text }] });

  const filePath = args?.path;
  const pathAccepted = Boolean(filePath) && isSafePath(filePath);
  if (!pathAccepted) {
    return failure("Invalid path");
  }

  try {
    const raw = await fs.readFile(resolvePath(filePath), 'utf-8');
    const code = filePath.endsWith('.js') ? await decodeDsSource(raw) : raw;
    // Constructing the Script compiles the code; nothing is executed.
    new vm.Script(code);
    return success("Syntax OK");
  } catch (e) {
    return failure(`Syntax Error: ${e.message}\n${e.stack}`);
  }
};