-
Notifications
You must be signed in to change notification settings - Fork 283
Expand file tree
/
Copy pathbase_spider.py
More file actions
425 lines (393 loc) · 15 KB
/
base_spider.py
File metadata and controls
425 lines (393 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : base_spider.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Author's Blog: https://blog.csdn.net/qq_32394351
# Date : 2024/1/7
import os.path
import sys
sys.path.append('..')
try:
# from base.spider import Spider as BaseSpider
from base.spider import BaseSpider
except ImportError:
from t4.base.spider import BaseSpider
import json
import time
import base64
import re
from pathlib import Path
import io
import tokenize
from urllib.parse import quote
"""
配置示例:
t4的配置里ext节点会自动变成api对应query参数extend,但t4的ext字符串不支持路径格式,比如./开头或者.json结尾
api里会自动含有ext参数是base64编码后的选中的筛选条件
{
"key":"hipy_t4_base_spider",
"name":"base_spider(hipy_t4)",
"type":4,
"api":"http://192.168.31.49:5707/api/v1/vod/base_spider",
"searchable":1,
"quickSearch":0,
"filterable":1,
"ext":"base_spider"
},
{
"key": "hipy_t3_base_spider",
"name": "base_spider(hipy_t3)",
"type": 3,
"api": "{{host}}/txt/hipy/base_spider.py",
"searchable": 1,
"quickSearch": 0,
"filterable": 1,
"ext": "{{host}}/txt/hipy/base_spider.json"
},
"""
class Spider(BaseSpider): # 元类 默认的元类 type
def getName(self):
    """
    Return the display name of this rule.

    @return: the rule name (a placeholder text in this template)
    """
    rule_name = "规则名称如:基础示例"
    return rule_name
def init_api_ext_file(self):
    """
    Generate the filter-rule JSON file that belongs to this .py file.

    The target path is this module's path with its extension swapped to
    ``.json``. An existing file is overwritten with the sample filter rules
    defined below.
    @return: None
    """
    # Swap only the final extension. str.replace('.py', '.json') would also
    # rewrite any other '.py' found in the path (e.g. a '.pyenv' directory).
    ext_file = str(Path(__file__).with_suffix('.json'))
    print(f'ext_file:{ext_file}')
    ext_file_dict = {
        "分类1": [
            {"key": "letter", "name": "首字母",
             "value": [{"n": "A", "v": "A"}, {"n": "B", "v": "B"}]},
        ],
        "分类2": [
            {"key": "letter", "name": "首字母",
             "value": [{"n": "A", "v": "A"}, {"n": "B", "v": "B"}]},
            {"key": "year", "name": "年份",
             "value": [{"n": "2024", "v": "2024"}, {"n": "2023", "v": "2023"}]},
        ],
    }
    # The file is only written, never read back, so plain 'w' is enough.
    with open(ext_file, mode='w', encoding='utf-8') as f:
        f.write(json.dumps(ext_file_dict, ensure_ascii=False))
def init(self, extend=""):
    """
    Load extra filter rules into ``self.config['filter']``.

    The source of the rules is taken from ``self.extend``:
      * starts with ``./``  -> a file path relative to this module's directory
      * starts with ``http`` -> fetched with ``self.fetch``; the JSON body is merged
      * anything else        -> ``<ext>.json`` inside this module's directory
    Errors while loading are printed and otherwise ignored.
    @param extend: only inspected when it is a list; the first item whose type
                   representation contains '.Spider' is stored as ``self.module``
    @return: None
    """
    module_dir = os.path.dirname(__file__)

    def init_file(ext_file):
        """
        Merge the JSON object stored in ext_file into the filter rules.
        A missing file is silently skipped.
        """
        ext_file = Path(ext_file).as_posix()
        if not os.path.exists(ext_file):
            return
        with open(ext_file, mode='r', encoding='utf-8') as f:
            try:
                ext_dict = json.loads(f.read())
                self.config['filter'].update(ext_dict)
            except Exception as e:
                print(f'更新扩展筛选条件发生错误:{e}')

    ext = self.extend
    print(f"============ext:{ext},extend:{extend}============")
    if isinstance(ext, str) and ext:
        if ext.startswith('./'):
            init_file(os.path.join(module_dir, ext))
        elif ext.startswith('http'):
            try:
                r = self.fetch(ext)
                self.config['filter'].update(r.json())
            except Exception as e:
                print(f'更新扩展筛选条件发生错误:{e}')
        else:
            # Bare name: resolve to "<name>.json" next to this module.
            init_file(os.path.join(module_dir, './' + ext + '.json'))

    # Attach a helper module; the first matching entry is enough.
    if isinstance(extend, list):
        for lib in extend:
            if '.Spider' in str(type(lib)):
                self.module = lib
                break
def isVideoFormat(self, url):
    """
    Placeholder hook, not implemented in this template.

    @param url: unused here
    @return: always None
    """
    return None
def manualVideoCheck(self):
    """
    Placeholder hook, not implemented in this template.

    @return: always None
    """
    return None
def homeContent(self, filterable=False):
    """
    Return the home-page categories and, optionally, the filter rules.

    @param filterable: whether filtering is enabled; mirrors the filterable
                       option of the t3/t4 config
    @return: {'class': [...]} plus 'filters' (``self.config['filter']``) when
             filterable is truthy
    """
    class_name = '电影&电视剧&综艺&动漫'  # static category names, '&'-separated
    class_url = '1&2&3&4'  # static category ids, '&'-separated

    classes = []
    if class_name and class_url:
        # zip stops at the shorter list, so surplus names or ids are dropped.
        classes = [
            {'type_name': type_name, 'type_id': type_id}
            for type_name, type_id in zip(class_name.split('&'), class_url.split('&'))
        ]

    result = {'class': classes}
    if filterable:
        result['filters'] = self.config['filter']
    return result
def homeVideoContent(self):
    """
    Return the home-page recommendation list.

    @return: {'list': [...]} holding one hard-coded sample entry
    """
    sample = {
        'vod_name': '测试',
        'vod_id': 'index.html',
        'vod_pic': 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
        'vod_remarks': '原始hipy',
    }
    return {'list': [sample]}
def categoryContent(self, tid, pg, filterable, extend):
    """
    Return the first-level (category list) page data.

    @param tid: category id
    @param pg: current page number
    @param filterable: whether filtering is enabled (unused in this template)
    @param extend: currently selected filter data (unused in this template)
    @return: dict with 'list', 'page', 'pagecount', 'limit' and 'total'
    """
    page_count = 24  # a page is treated as full when it holds this many items
    d = [{
        'vod_name': '测试',
        'vod_id': 'index.html',
        'vod_pic': 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
        # Formatted rather than concatenated so a non-str tid (e.g. an int)
        # does not raise TypeError.
        'vod_remarks': f'类型:{tid}',
    }]
    return {
        'list': d,
        'page': pg,
        # A full page means more pages may follow; otherwise this is the last.
        'pagecount': 9999 if len(d) >= page_count else pg,
        'limit': 90,
        'total': 999999,
    }
def detailContent(self, ids):
    """
    Return the second-level (detail page) data.

    @param ids: list of vod_id values handed over from the first-level list;
                only the first one is used
    @return: {'list': [vod]} with a hard-coded sample detail record
    """
    vod = {
        "vod_id": ids[0],
        "vod_name": '测试二级',
        "vod_pic": 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
        "type_name": '详情页类型',
        "vod_year": '详情页年份',
        "vod_area": '详情页地区',
        "vod_remarks": '详情页标签',
        "vod_actor": '详情页演员名称',
        "vod_director": '详情页导演名称',
        "vod_content": '详情页剧情描述',
        # Play sources are separated by '$$$'.
        "vod_play_from": '测试线路1$$$测试线路2',
        # Per source: episodes separated by '#', each written as 'name$url';
        # the sources themselves are again separated by '$$$'.
        "vod_play_url": '选集播放1$1.mp4#选集播放2$2.mp4$$$选集播放3$3.mp4#选集播放4$4.mp4',
    }
    return {'list': [vod]}
def searchContent(self, wd, quick=False, pg=1):
    """
    Search www.bttwo.net for a keyword and return the result list.

    Steps, as implemented below: request the search page, collect the cookies
    it sets, read the arithmetic captcha from the page, post the computed
    answer, request the search page again and parse the result items.
    @param wd: search keyword
    @param quick: whether the call comes from quick search; True only when quick
                  search is enabled in the t3/t4 config and is the one running
    @param pg: page number (not used by this implementation)
    @return: {'list': [...]}; a single placeholder entry named after wd when
             no result item was found
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
        "Host": "www.bttwo.net",
        "Referer": "https://www.bttwo.net/"
    }
    url = f'https://www.bttwo.net/xssearch?q={quote(wd)}'
    r = self.fetch(url, headers=headers)

    # Keep only the "name=value" part of every Set-Cookie header.
    cookies = ['myannoun=1'] + [
        value.split(';')[0]
        for key, value in r.headers.items()
        if str(key).lower() == 'set-cookie'
    ]
    # Only the Cookie header is added; further browser-like headers (Pragma,
    # Origin, Referer, Sec-Ch-Ua*, Sec-Fetch-*, Upgrade-Insecure-Requests)
    # were left disabled in the original code.
    headers['Cookie'] = ';'.join(cookies)
    print(headers)

    # The captcha is the text inside the captcha form; it is evaluated as an
    # arithmetic expression.
    captcha_page = self.html(r.text)
    captcha = ''.join(captcha_page.xpath('//*[@class="erphp-search-captcha"]/form/text()')).strip()
    print('验证码:', captcha)
    answer = self.eval_computer(captcha)
    print('回答:', captcha, answer)
    data = {'result': str(answer)}
    print('待post数据:', data)
    self.post(url, data=data, headers=headers, cookies=None)

    # Request the search page again after posting the answer, then parse it.
    r = self.fetch(url, headers=headers)
    result_page = self.html(r.text)
    lis = result_page.xpath('//*[contains(@class,"search_list")]/ul/li')
    print('搜索结果数:', len(lis))

    if len(lis) < 1:
        d = [{
            'vod_name': wd,
            'vod_id': 'index.html',
            'vod_pic': 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
            'vod_remarks': '测试搜索',
        }]
    else:
        d = [
            {
                'vod_name': ''.join(li.xpath('h3//text()')),
                'vod_id': ''.join(li.xpath('a/@href')),
                'vod_pic': ''.join(li.xpath('a/img/@data-original')),
                'vod_remarks': ''.join(li.xpath('p//text()')),
            }
            for li in lis
        ]

    result = {'list': d}
    print(result)
    return result
def playerContent(self, flag, id, vipFlags):
    """
    Resolve playback and return the JSON-like dict for the player shell, which
    then plays the direct link or sniffs the page as indicated by 'parse'.

    This template ignores all three arguments and returns a fixed sample stream.
    @param flag: vod_play_from, the play source line
    @param id: vod_play_url, the link to play
    @param vipFlags: vip flags
    @return: dict with 'parse', 'playUrl', 'url' and 'header'
    """
    # Alternative sample stream kept from the original:
    # 'http://bizcommon.alicdn.com/l2nDqpMmn6DGHnWzZQA/Cg9qI5imMInpPvK5Mnm%40%40hd.m3u8'
    play_url = 'https://s1.bfzycdn.com/video/renmindemingyi/%E7%AC%AC07%E9%9B%86/index.m3u8'
    user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
    return {
        'parse': 0,  # 1 = sniff, 0 = play directly
        'playUrl': '',  # parser link
        'url': play_url,  # direct link, or the address to sniff
        'header': {'User-Agent': user_agent},  # headers used for playback
    }
@staticmethod
def adRemove():
return 'reg:/video/adjump.*?ts'
# Spider configuration. init() merges extra filter rules into config['filter']
# and homeContent() returns that dict as 'filters'.
config = dict(player={}, filter={})

# Default request headers of this template.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36',
    'Host': 'www.baidu.com',
    'Referer': 'https://www.baidu.com/',
}
def localProxy(self, params):
    """
    Local proxy endpoint; this template answers with a fixed m3u8 playlist.

    Example request from the original code:
    http://192.168.31.49:5707/api/v1/vod/哔滴影视?proxy=1&do=py&type=1.m3u8
    @param params: request parameters (only printed here)
    @return: [status_code, content_type, body]
    """
    print(params)
    playlist_lines = (
        '#EXTM3U',
        '#EXT-X-VERSION:3',
        '#EXT-X-ALLOW-CACHE:YES',
        '#EXT-X-MEDIA-SEQUENCE:170471784',
        '#EXT-X-TARGETDURATION:10',
        '#EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:43:53+08:00',
        '#EXTINF:10.000, no desc',
        'http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471784.ts',
        '#EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:44:03+08:00',
        '#EXTINF:10.000, no desc',
        'http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471785.ts',
        '#EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:44:13+08:00',
        '#EXTINF:10.000, no desc',
        'http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471786.ts',
        '#EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:44:23+08:00',
        '#EXTINF:10.000, no desc',
        'http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471787.ts',
    )
    content = '\n'.join(playlist_lines)
    # Other response shapes tried in the original:
    #   [404, 'text/plain', 'Not Found']
    #   [200, "video/MP2T", content]
    #   [200, "video/MP2T", ""]
    return [200, 'text/plain', content]
# -----------------------------------------------自定义函数-----------------------------------------------
def eval_computer(self, text):
    """
    Evaluate an arithmetic expression given as text, using self.safe_eval.

    Every '=' is removed from the text first, so "1+2=" is evaluated as "1+2".
    @param text: the expression, e.g. add / subtract / multiply / divide
    @return: the computed value, or None when safe_eval stored no result
             (for example because the expression could not be evaluated)
    """
    expression = text.replace("=", "")
    localdict = {}
    self.safe_eval(f'ret={expression}', localdict)
    # Return the stored value unchanged. The former `... or None` turned every
    # falsy result, notably a legitimate answer of 0, into None.
    return localdict.get('ret')
def safe_eval(self, code: str = '', localdict: dict = None):
    """
    Execute a snippet of Python code with a reduced set of builtins.

    The code first passes self.check_unsafe_attributes, then runs through
    exec() with globals that contain a trimmed copy of the builtins plus
    json, print, re, time and base64.
    @param code: source code to execute; surrounding whitespace is stripped
    @param localdict: dict used as the local namespace and filled by the code;
                      a new dict is created when omitted
    @return: {} for empty code, localdict after a successful run, or
             {'error': message} when the check or the execution raised
    """
    # SECURITY NOTE: a name blacklist is not a real sandbox. Builtins such as
    # getattr/compile/type remain reachable, so do not rely on this to contain
    # hostile input. Callers in this file feed it text scraped from a remote
    # page; validate such text before it gets here.
    code = code.strip()
    if not code:
        return {}
    if localdict is None:
        localdict = {}

    # __builtins__ is a module in __main__ and a dict in imported modules.
    if isinstance(__builtins__, dict):
        safe_builtins = dict(__builtins__)
    else:
        safe_builtins = dict(vars(__builtins__))
    # Remove the unsafe names. pop() with a default is used because some of
    # them ('copyright', 'quit') are injected by the site module and may be
    # absent, in which case `del` would raise KeyError outside the try below.
    for key in ('__import__', 'eval', 'exec', 'globals', 'dir', 'copyright', 'open', 'quit'):
        safe_builtins.pop(key, None)

    # Restricted builtins, no way to import; a few helper modules are exposed.
    global_dict = {
        '__builtins__': safe_builtins,
        'json': json, 'print': print,
        're': re, 'time': time, 'base64': base64,
    }
    try:
        self.check_unsafe_attributes(code)
        exec(code, global_dict, localdict)
        return localdict
    except Exception as e:
        return {'error': f'执行报错:{e}'}
# ==================== 静态函数 ======================
@staticmethod
def check_unsafe_attributes(string):
"""
安全检测需要exec执行的python代码
:param string:
:return:
"""
g = tokenize.tokenize(io.BytesIO(string.encode('utf-8')).readline)
pre_op = ''
for toktype, tokval, _, _, _ in g:
if toktype == tokenize.NAME and pre_op == '.' and tokval.startswith('_'):
attr = tokval
msg = "access to attribute '{0}' is unsafe.".format(attr)
raise AttributeError(msg)
elif toktype == tokenize.OP:
pre_op = tokval
if __name__ == '__main__':
    # Manual smoke test. It expects 'test_1.txt' and 'ad.m3u8' in the current
    # working directory.
    spider = Spider()
    spider.init()
    # spider.init_api_ext_file()  # generate the JSON file holding the filter rules
    spider.log({'key': 'value'})
    spider.log('====文本内容====')

    # Parse the text of test_1.txt with superStr2dict and show the outcome.
    with open('test_1.txt', encoding='utf-8') as f:
        code = f.read()
    parsed = spider.superStr2dict(code)
    print(type(parsed), parsed)

    # spider.searchContent('斗罗大陆')
    print(spider.playerContent(None, 1, None))

    # The first line of ad.m3u8 is the playlist url, the rest is its content.
    with open('ad.m3u8', encoding='utf-8') as f:
        url, _, adt = f.read().partition('\n')
    ad_remove = 'reg:/video/adjump(.*?)ts'
    print(spider.fixAdM3u8(adt, url, ad_remove))