/*!
* @module htmlParser
* @brief T3解析html处理库
* @version 3.1.0
*
* @original-author hjdhnx
* @original-source {@link https://github.com/hjdhnx/hipy-server/blob/master/app/t4/base/htmlParser.py | Source on GitHub}
*
* @modified-by HiramWong
* @modification-date 2023-04-09T18:31:59+08:00
* @modification-description Python转TypeScript, 适用于JavaScript项目
*/
import * as cheerio from 'cheerio';
import jsonpath from 'jsonpath';
import urlJoin from 'url';
// Enables reuse of an already-parsed document when the same HTML is parsed again.
const PARSE_CACHE = true;
// Regex source: a selector segment matching any of these alternatives does not
// get an automatic `:eq(0)` index appended by `parseHikerToJq`.
const NOADD_INDEX = [
  ':eq',
  ':lt',
  ':gt',
  ':first',
  ':last',
  ':not',
  ':even',
  ':odd',
  ':has',
  ':contains',
  ':matches',
  ':empty',
  '^body$',
  '^#',
].join('|');
// Regex source: attribute names (matched by suffix or by prefix) whose values
// are resolved against the base URL.
const URLJOIN_ATTR = `(${['url', 'src', 'href', '-original', '-src', '-play', '-url', 'style'].join('|')})$|^(${[
  'data-',
  'url-',
  'src-',
].join('|')})`;
// Regex source: URL schemes that are returned as-is and never passed through URL joining.
const SPECIAL_URL = `^(${['ftp', 'magnet', 'thunder', 'ws'].join('|')}):`;
/**
 * Rule-driven extractor for HTML (via cheerio) and JSON (via jsonpath).
 *
 * HTML rules use a "Hiker" style expression: selector segments joined by `&&`,
 * optionally followed by a final option segment (`Text`, `Html`, or one or more
 * attribute names separated by `||`). A selector segment may carry `--selector`
 * suffixes naming descendants to strip from the match.
 */
class Jsoup {
  /** Base URL used by `pd` / `pj` when no explicit base URL is supplied. */
  MY_URL: string = '';
  /** HTML source whose parsed document is currently held in `pdfh_doc`. */
  pdfh_html = '';
  /** HTML source whose parsed document is currently held in `pdfa_doc`. */
  pdfa_html = '';
  /** Cached parsed document for `pdfh`. */
  pdfh_doc: cheerio.Root | null = null;
  /** Cached parsed document for `pdfa`. */
  pdfa_doc: cheerio.Root | null = null;

  /**
   * @param MY_URL Base URL used for resolving relative links.
   */
  constructor(MY_URL: string = '') {
    this.MY_URL = MY_URL;
  }

  /**
   * Returns the parsed document for `html`, reusing the per-slot cache when
   * `PARSE_CACHE` is enabled and the same HTML string was parsed last time.
   * Each slot keeps its own cache so `pdfa` and `pdfh` do not evict each other.
   */
  private loadDoc(html: string, slot: 'pdfa' | 'pdfh'): cheerio.Root {
    if (!PARSE_CACHE) return cheerio.load(html);
    if (slot === 'pdfa') {
      if (this.pdfa_doc === null || this.pdfa_html !== html) {
        this.pdfa_doc = cheerio.load(html);
        this.pdfa_html = html;
      }
      return this.pdfa_doc;
    }
    if (this.pdfh_doc === null || this.pdfh_html !== html) {
      this.pdfh_doc = cheerio.load(html);
      this.pdfh_html = html;
    }
    return this.pdfh_doc;
  }

  /**
   * Tests `string` against the regex source `text` (multi-line, case-insensitive).
   * @param text Regular-expression source.
   * @param string Subject string.
   * @returns Whether the pattern matches anywhere in `string`.
   */
  test(text: string, string: string): boolean {
    return new RegExp(text, 'mi').test(string);
  }

  /**
   * @returns Whether `text` contains the substring `match`.
   */
  contains(text: string, match: string): boolean {
    return text.includes(match);
  }

  /**
   * Converts a Hiker expression into a space-separated native selector chain,
   * appending `:eq(0)` to segments that carry no positional/filter pseudo.
   *
   * The last segment only receives `:eq(0)` when `first` is true; every earlier
   * segment always does.
   * @param parse Hiker expression (segments joined by `&&`).
   * @param first Whether the last segment should also be pinned to the first match.
   * @returns The converted selector chain.
   */
  parseHikerToJq(parse: string, first: boolean = false): string {
    // An expression without `&&` is simply a chain with one (last) segment.
    const segments = parse.split('&&');
    const lastIndex = segments.length - 1;
    return segments
      .map((segment, i) => {
        // Only the final space-separated token decides whether an index is needed.
        const tokens = segment.split(' ');
        const tail = tokens[tokens.length - 1];
        const hasOwnIndex = this.test(NOADD_INDEX, tail);
        const skipLast = !first && i >= lastIndex;
        return hasOwnIndex || skipLast ? segment : `${segment}:eq(0)`;
      })
      .join(' ');
  }

  /**
   * Splits one selector segment into its selector, `:eq` index and exclusion list.
   *
   * Exclusions are written as `--selector` suffixes, either before or after the
   * `:eq(n)` part. A missing or non-numeric `:eq` argument yields index 0.
   * @param nparse A single selector segment.
   * @returns The bare selector, the index (default 0) and selectors to remove.
   */
  getParseInfo(nparse: string): { nparse_rule: string; nparse_index: number; excludes: string[] } {
    let excludes: string[] = [];
    let nparse_index = 0;
    let nparse_rule = nparse;
    if (this.contains(nparse, ':eq')) {
      const eqParts = nparse.split(':eq');
      nparse_rule = eqParts[0];
      let nparse_pos = eqParts[1];
      if (this.contains(nparse_rule, '--')) {
        const ruleParts = nparse_rule.split('--');
        excludes = ruleParts.slice(1);
        nparse_rule = ruleParts[0];
      } else if (this.contains(nparse_pos, '--')) {
        const posParts = nparse_pos.split('--');
        excludes = posParts.slice(1);
        nparse_pos = posParts[0];
      }
      // Take the text between the first '(' and the following ')'.
      const digits = nparse_pos.split('(')[1]?.split(')')[0];
      const parsed = digits === undefined ? NaN : parseInt(digits, 10);
      // parseInt never throws; guard against NaN so a malformed index falls back to 0.
      if (!Number.isNaN(parsed)) nparse_index = parsed;
    } else if (this.contains(nparse, '--')) {
      const parts = nparse.split('--');
      nparse_rule = parts[0];
      excludes = parts.slice(1);
    }
    return { nparse_rule, nparse_index, excludes };
  }

  /**
   * Applies one selector segment to the current selection.
   * @param doc Document function returned by `cheerio.load()`.
   * @param nparse One selector segment (may contain `:eq(n)` and `--exclude` parts).
   * @param ret Current selection, or a falsy value to select from the document root.
   * @returns The resulting selection.
   */
  parseOneRule(doc, nparse: string, ret) {
    const { nparse_rule, nparse_index, excludes } = this.getParseInfo(nparse);
    let selected = ret ? ret.find(nparse_rule) : doc(nparse_rule);
    if (this.contains(nparse, ':eq')) selected = selected.eq(nparse_index);
    if (excludes.length > 0 && selected) {
      // Work on a clone so removing excluded nodes does not alter the source DOM.
      selected = selected.clone();
      for (const exclude of excludes) {
        selected.find(exclude).remove();
      }
    }
    return selected;
  }

  /**
   * Returns the outer HTML of every element matched by `parse`.
   * @param html HTML source.
   * @param parse Hiker expression.
   * @returns One outer-HTML string per matched element; empty when an input is empty.
   */
  pdfa(html: string, parse: string): string[] {
    if (!html || !parse) return [];
    const chain = this.parseHikerToJq(parse);
    const doc = this.loadDoc(html, 'pdfa');
    let ret: cheerio.Cheerio | null = null;
    for (const nparse of chain.split(' ')) {
      ret = this.parseOneRule(doc, nparse, ret);
      if (!ret) return [];
    }
    if (!ret) return [];
    // Stringifying a cheerio selection yields its outer HTML.
    return ret.toArray().map((item) => `${doc(item)}`);
  }

  /**
   * Returns the outer HTML of every element matched by `parse`.
   *
   * `list_text`, `list_url` and `url_key` are accepted but not used by this
   * implementation.
   * @param html HTML source.
   * @param parse Hiker expression.
   * @returns One outer-HTML string per matched element; empty when an input is empty.
   */
  pdfl(html: string, parse: string, list_text: string, list_url: string, url_key: string): string[] {
    if (!html || !parse) return [];
    const chain = this.parseHikerToJq(parse, false);
    const doc = cheerio.load(html);
    let ret: cheerio.Cheerio | null = null;
    for (const pars of chain.split(' ')) {
      ret = this.parseOneRule(doc, pars, ret);
      if (!ret) return [];
    }
    if (!ret) return [];
    return ret.toArray().map((element) => `${doc(element)}`);
  }

  /**
   * Extracts a single string from `html` according to `parse`.
   *
   * The last `&&` segment is the option: `Text` returns text content, `Html`
   * returns inner HTML, anything else is read as attribute names separated by
   * `||`, tried in order until one yields a non-empty value. Without an option
   * the outer HTML of the selection is returned.
   * @param html HTML source.
   * @param parse Hiker expression.
   * @param baseUrl When non-empty, URL-like attribute values are resolved against it.
   * @returns The extracted string, or '' when an input is empty or nothing is found.
   */
  pdfh(html: string, parse: string, baseUrl: string = ''): string {
    if (!html || !parse) return '';
    const doc = this.loadDoc(html, 'pdfh');
    if (parse === 'body&&Text' || parse === 'Text') {
      //@ts-ignore
      return doc.text();
    }
    if (parse === 'body&&Html' || parse === 'Html') {
      return doc.html();
    }
    let option: string | undefined;
    let selectorPart = parse;
    if (this.contains(parse, '&&')) {
      const parts = parse.split('&&');
      option = parts[parts.length - 1];
      selectorPart = parts.slice(0, -1).join('&&');
    }
    const chain = this.parseHikerToJq(selectorPart, true);
    let node: cheerio.Cheerio | null = null;
    for (const nparse of chain.split(' ')) {
      node = this.parseOneRule(doc, nparse, node);
      if (!node) return '';
    }
    if (!node) return '';
    // No option (or an empty one): return the selection's outer HTML.
    if (!option) return `${node}`;
    if (option === 'Text') return node.text() || '';
    if (option === 'Html') return node.html() || '';
    return this.readAttr(node, option, baseUrl);
  }

  /**
   * Reads the first non-empty attribute among the `||`-separated names in
   * `option`, unwrapping `url(...)` from style-like attributes and resolving
   * URL-like values against `baseUrl` when one is given.
   */
  private readAttr(node: cheerio.Cheerio, option: string, baseUrl: string): string {
    let value = '';
    for (const opt of option.split('||')) {
      value = node.attr(opt) || '';
      if (this.contains(opt.toLowerCase(), 'style') && this.contains(value, 'url(')) {
        const urlMatch = value.match(/url\((.*?)\)/);
        if (urlMatch) {
          // Strip one leading and one trailing quote from the extracted link.
          value = urlMatch[1].replace(/^['"]|['"]$/g, '');
        }
      }
      if (value && baseUrl) {
        const needAdd = this.test(URLJOIN_ATTR, opt) && !this.test(SPECIAL_URL, value);
        if (needAdd) {
          // A value that already embeds an absolute http(s) URL is cut down to it.
          value = value.includes('http') ? value.slice(value.indexOf('http')) : urlJoin.resolve(baseUrl, value);
        }
      }
      if (value) break;
    }
    return value;
  }

  /**
   * Same as `pdfh`, defaulting `baseUrl` to `MY_URL` when none is given.
   */
  pd(html: string, parse: string, baseUrl: string = ''): string {
    return this.pdfh(html, parse, baseUrl || this.MY_URL);
  }

  /**
   * Parses `html` with cheerio and returns the resulting document function.
   */
  pq(html: string) {
    return cheerio.load(html);
  }

  /**
   * Extracts a single string from JSON using JSONPath alternatives separated by `||`.
   *
   * The first alternative that yields a truthy first result wins.
   * @param html JSON text or an already-parsed value.
   * @param parse JSONPath expression(s); `$.` is prepended when the expression does not start with it.
   * @param addUrl When true, a non-empty result is resolved against `MY_URL`.
   * @returns The stringified result, or '' when nothing matched or JSON parsing failed.
   */
  pjfh(html: any, parse: string, addUrl = false): string {
    if (!html || !parse) return '';
    let data = html;
    if (typeof html === 'string') {
      try {
        data = JSON.parse(html);
      } catch {
        console.log('字符串转json失败');
        return '';
      }
    }
    // NOTE(review): only the first `||` alternative receives the `$.` prefix — confirm this is intended.
    const expr = parse.startsWith('$.') ? parse : '$.' + parse;
    let ret = '';
    for (const path of expr.split('||')) {
      const queryResult = jsonpath.query(data, path);
      const first = Array.isArray(queryResult) ? queryResult[0] : queryResult;
      ret = first ? `${first}` : '';
      if (addUrl && ret) {
        ret = urlJoin.resolve(this.MY_URL, ret);
      }
      if (ret) break;
    }
    return ret;
  }

  /**
   * Same as `pjfh` with URL resolution against `MY_URL` enabled.
   */
  pj(html: any, parse: string): string {
    return this.pjfh(html, parse, true);
  }

  /**
   * Returns all JSONPath matches for `parse`.
   *
   * When the query yields exactly one match and that match is itself an array,
   * the inner array is returned.
   * @param html JSON text or an already-parsed value.
   * @param parse JSONPath expression; `$.` is prepended when missing.
   * @returns The matches, or [] when an input is empty or JSON parsing failed.
   */
  pjfa(html: any, parse: string): any[] {
    if (!html || !parse) return [];
    let data = html;
    if (typeof html === 'string') {
      try {
        data = JSON.parse(html);
      } catch {
        return [];
      }
    }
    const expr = parse.startsWith('$.') ? parse : '$.' + parse;
    const result = jsonpath.query(data, expr);
    if (Array.isArray(result) && result.length === 1 && Array.isArray(result[0])) {
      return result[0]; // unwrap the single array match
    }
    return result || [];
  }
}
export default Jsoup;