package com;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that evaluate "&&"-separated extraction rules against HTML using Jsoup.
 *
 * <p>A rule is a chain of CSS selectors joined with {@code &&}. Each selector may carry an
 * {@code :eq(n)} position (negative values count from the end) and {@code --selector} suffixes
 * naming elements to strip from the match. For {@link #parseDomForUrl} the last {@code &&}
 * segment is an option: {@code Text}, {@code Html}, or one or more attribute names separated by
 * {@code ||}.
 *
 * <p>The most recently parsed document is kept in plain static fields (one slot for
 * {@link #parseDomForUrl}, one shared by {@link #parseDomForArray} and {@link #parseDomForList}).
 * These fields are not synchronized.
 */
public class HtmlParser {

    // Source text and parsed document most recently used by parseDomForUrl.
    private static String pdfh_html = "";
    private static Document pdfh_doc = null;

    // Source text and parsed document most recently used by parseDomForArray / parseDomForList.
    private static String pdfa_html = "";
    private static Document pdfa_doc = null;

    // Captures the argument of a CSS url(...) expression inside a style attribute.
    private static final Pattern p = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);

    // Strips one leading and one trailing single or double quote.
    private static final Pattern WRAPPING_QUOTES = Pattern.compile("^['\"](.*)['\"]$");

    // Selectors matching this pattern do not get an ":eq(0)" index appended automatically.
    private static final Pattern NOADD_INDEX = Pattern
            .compile(":eq|:lt|:gt|:first|:last|:not|:even|:odd|:has|:contains|:matches|:empty|^body$|^#");

    // Attribute names whose values are automatically joined against the base URL.
    private static final Pattern URLJOIN_ATTR = Pattern.compile(
            "(url|src|href|-original|-src|-play|-url|style)$|^(data-|url-|src-)",
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    // Values starting with one of these schemes are special links and are never URL-joined.
    private static final Pattern SPECIAL_URL = Pattern.compile("^(ftp|magnet|thunder|ws):",
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    /**
     * Concatenates the string form of every token, separated by {@code delimiter}.
     *
     * @param delimiter text placed between consecutive tokens
     * @param tokens    items to join, in iteration order
     * @return the joined text, or {@code ""} when {@code tokens} has no elements
     */
    public static String join(CharSequence delimiter, Iterable<?> tokens) {
        final Iterator<?> it = tokens.iterator();
        if (!it.hasNext()) {
            return "";
        }
        final StringBuilder sb = new StringBuilder();
        sb.append(it.next());
        while (it.hasNext()) {
            sb.append(delimiter);
            sb.append(it.next());
        }
        return sb.toString();
    }

    /**
     * Concatenates the string form of every array element, separated by {@code delimiter}.
     *
     * @param delimiter text placed between consecutive tokens
     * @param tokens    items to join, in array order
     * @return the joined text, or {@code ""} when {@code tokens} is empty
     */
    public static String join(CharSequence delimiter, Object[] tokens) {
        final int length = tokens.length;
        if (length == 0) {
            return "";
        }
        final StringBuilder sb = new StringBuilder();
        sb.append(tokens[0]);
        for (int i = 1; i < length; i++) {
            sb.append(delimiter);
            sb.append(tokens[i]);
        }
        return sb.toString();
    }

    /**
     * Resolves {@code child} against {@code parent} using {@link URL#URL(URL, String)}.
     *
     * @param parent base URL; when empty, {@code child} is returned untouched
     * @param child  absolute or relative reference
     * @return the resolved URL in external form; if a {@link MalformedURLException} is raised the
     *         stack trace is printed and {@code parent} is returned unchanged
     */
    public static String joinUrl(String parent, String child) {
        if (parent.isEmpty()) {
            return child;
        }
        String q = parent;
        try {
            URL url = new URL(new URL(parent), child);
            q = url.toExternalForm();
        } catch (MalformedURLException e) {
            // Best effort: keep the parent as the result when resolution is impossible.
            e.printStackTrace();
        }
        return q;
    }

    /** Result of splitting one rule segment into selector, position and exclusions. */
    public static class Painfo {
        /** CSS selector with the {@code :eq(...)} and {@code --exclude} parts removed. */
        public String nparse_rule;
        /** Position taken from {@code :eq(n)}; 0 when absent or not a number. */
        public int nparse_index;
        /** Selectors to remove from the match, or {@code null} when the rule has none. */
        public List<String> excludes;
    }

    /**
     * Drops the first element of a "--"-split rule, leaving only the exclusion selectors.
     */
    private static List<String> excludesOf(String[] parts) {
        List<String> excludes = new ArrayList<>(Arrays.asList(parts));
        excludes.remove(0);
        return excludes;
    }

    /**
     * Splits a single rule segment into its selector, its {@code :eq} position and its
     * {@code --} exclusion list. Exclusions may be written either before or after the
     * {@code :eq(n)} part.
     *
     * @param nparse one space-free rule segment
     * @return the parsed pieces
     */
    private static Painfo getParseInfo(String nparse) {
        Painfo painfo = new Painfo();
        painfo.nparse_rule = nparse; // by default the selector is the segment itself

        int eqAt = nparse.indexOf(":eq");
        if (eqAt >= 0) {
            // Split at the ":eq" marker rather than at the first ':' so that selectors which
            // contain other pseudo-classes (e.g. "a:has(img):eq(1)") keep their full text.
            String selector = nparse.substring(0, eqAt);
            String position = nparse.substring(eqAt + 1);
            // Anything after a further ':' is ignored, as it always has been.
            int nextColon = position.indexOf(':');
            if (nextColon >= 0) {
                position = position.substring(0, nextColon);
            }
            painfo.nparse_rule = selector;
            if (selector.contains("--")) {
                String[] rules = selector.split("--");
                painfo.excludes = excludesOf(rules);
                painfo.nparse_rule = rules[0];
            } else if (position.contains("--")) {
                String[] rules = position.split("--");
                painfo.excludes = excludesOf(rules);
                position = rules[0];
            }
            try {
                painfo.nparse_index = Integer.parseInt(position.replace("eq(", "").replace(")", ""));
            } catch (NumberFormatException e) {
                painfo.nparse_index = 0; // unparsable position falls back to the first match
            }
        } else if (nparse.contains("--")) {
            String[] rules = nparse.split("--");
            painfo.excludes = excludesOf(rules);
            painfo.nparse_rule = rules[0];
        }
        return painfo;
    }

    /**
     * Tells whether a rule segment should receive an automatic {@code :eq(0)}: true when the
     * last space-separated word of the segment does not match {@link #NOADD_INDEX}.
     */
    private static boolean lacksIndex(String segment) {
        String[] words = segment.split(" ");
        return !NOADD_INDEX.matcher(words[words.length - 1]).find();
    }

    /**
     * Converts a "&&"-separated rule into a space-separated selector chain, appending
     * {@code :eq(0)} to segments that carry no index of their own.
     *
     * @param parse the rule to convert
     * @param first when true the final segment also receives {@code :eq(0)}; when false the
     *              final segment is left as written
     * @return the converted rule
     */
    private static String parseHikerToJq(String parse, boolean first) {
        if (!parse.contains("&&")) {
            return (first && lacksIndex(parse)) ? parse + ":eq(0)" : parse;
        }
        String[] parses = parse.split("&&");
        List<String> new_parses = new ArrayList<>();
        for (int i = 0; i < parses.length; i++) {
            boolean isLast = i >= parses.length - 1;
            if (lacksIndex(parses[i]) && (first || !isLast)) {
                new_parses.add(parses[i] + ":eq(0)");
            } else {
                new_parses.add(parses[i]);
            }
        }
        return join(" ", new_parses);
    }

    /**
     * Extracts a single value (text, HTML or an attribute) from {@code html}.
     *
     * <p>The rules {@code Text}/{@code body&&Text} and {@code Html}/{@code body&&Html} return the
     * whole document's text or HTML. Otherwise the last {@code &&} segment is the option:
     * {@code Text}, {@code Html}, or attribute names separated by {@code ||}, tried in order
     * until one yields a non-empty value. A rule without {@code &&} returns the outer HTML of
     * the match.
     *
     * <p>For attribute names containing "style" whose value contains {@code url(}, the argument
     * of {@code url(...)} is taken and wrapping quotes are stripped. When {@code add_url} is
     * non-empty and the attribute name matches {@link #URLJOIN_ATTR}, the value is cut to start
     * at its first "http" if it has one, otherwise joined to {@code add_url}; values matching
     * {@link #SPECIAL_URL} are left alone.
     *
     * @param html    markup to parse
     * @param rule    extraction rule
     * @param add_url base URL for joining relative links; empty disables joining
     * @return the extracted value; {@code ""} when nothing matches (never {@code null})
     */
    public static String parseDomForUrl(String html, String rule, String add_url) {
        if (!pdfh_html.equals(html)) {
            pdfh_html = html;
            pdfh_doc = Jsoup.parse(html);
        }
        Document doc = pdfh_doc;
        if (rule.equals("body&&Text") || rule.equals("Text")) {
            return doc.text();
        }
        if (rule.equals("body&&Html") || rule.equals("Html")) {
            return doc.html();
        }

        String option = "";
        if (rule.contains("&&")) {
            String[] rs = rule.split("&&");
            option = rs[rs.length - 1];
            rule = join("&&", Arrays.copyOf(rs, rs.length - 1));
        }
        rule = parseHikerToJq(rule, true);

        Elements ret = new Elements();
        for (String nparse : rule.split(" ")) {
            ret = parseOneRule(doc, nparse, ret);
            if (ret.isEmpty()) {
                return "";
            }
        }

        if (option.isEmpty()) {
            return ret.outerHtml();
        }
        if (option.equals("Text")) {
            return ret.text();
        }
        if (option.equals("Html")) {
            return ret.html();
        }

        // Start from "" so that an option made only of '|' characters yields "" instead of null.
        String result = "";
        // One or more '|' separate alternative attribute names ("a||b" as well as "a|b").
        for (String opt : option.split("\\|+")) {
            if (opt.isEmpty()) {
                continue;
            }
            result = ret.attr(opt);
            if (opt.toLowerCase().contains("style") && result.contains("url(")) {
                Matcher m = p.matcher(result);
                if (m.find()) {
                    result = m.group(1);
                }
                // Added 2023/07/28: strip wrapping single/double quotes from the style link.
                result = WRAPPING_QUOTES.matcher(result).replaceAll("$1");
            }
            if (!result.isEmpty() && !add_url.isEmpty()) {
                Matcher joinable = URLJOIN_ATTR.matcher(opt);
                Matcher special = SPECIAL_URL.matcher(result);
                if (joinable.find() && !special.find()) {
                    if (result.contains("http")) {
                        result = result.substring(result.indexOf("http"));
                    } else {
                        result = joinUrl(add_url, result);
                    }
                }
            }
            if (!result.isEmpty()) {
                return result;
            }
        }
        return result;
    }

    /**
     * Runs a list rule against {@code html} through the shared document cache and returns the
     * matched elements; the final segment does not receive an automatic {@code :eq(0)}.
     */
    private static Elements selectForList(String html, String rule) {
        if (!pdfa_html.equals(html)) {
            pdfa_html = html;
            pdfa_doc = Jsoup.parse(html);
        }
        Document doc = pdfa_doc;
        Elements ret = new Elements();
        for (String pars : parseHikerToJq(rule, false).split(" ")) {
            ret = parseOneRule(doc, pars, ret);
            if (ret.isEmpty()) {
                return ret;
            }
        }
        return ret;
    }

    /**
     * Returns the outer HTML of every element matched by {@code rule}.
     *
     * @param html markup to parse
     * @param rule "&&"-separated selector rule
     * @return one outer-HTML string per match; an empty list when nothing matches
     */
    public static List<String> parseDomForArray(String html, String rule) {
        List<String> eleHtml = new ArrayList<>();
        for (Element element : selectForList(html, rule)) {
            eleHtml.add(element.outerHtml());
        }
        return eleHtml;
    }

    /**
     * Applies one rule segment: selects, picks the {@code :eq} position if the segment has one,
     * then removes excluded descendants from a clone of the match.
     *
     * @param doc    document searched when {@code ret} is empty
     * @param nparse one rule segment
     * @param ret    result of the previous segment; searched instead of {@code doc} when non-empty
     * @return the elements matched by this segment
     */
    private static Elements parseOneRule(Document doc, String nparse, Elements ret) {
        Painfo painfo = getParseInfo(nparse);
        ret = ret.isEmpty() ? doc.select(painfo.nparse_rule) : ret.select(painfo.nparse_rule);

        if (nparse.contains(":eq")) {
            int index = painfo.nparse_index;
            if (index < 0) {
                index += ret.size(); // negative positions count from the end
            }
            // A position that still lies before the first match selects nothing; it is not
            // handed to Elements.eq as a negative index.
            ret = index < 0 ? new Elements() : ret.eq(index);
        }

        if (painfo.excludes != null && !ret.isEmpty()) {
            ret = ret.clone(); // work on a clone so removal does not alter the cached document
            for (String exclude : painfo.excludes) {
                ret.select(exclude).remove();
            }
        }
        return ret;
    }

    /**
     * Builds "text$url" entries, one per element matched by {@code p1}. The text and URL are
     * each extracted from the element's outer HTML with {@link #parseDomForUrl}.
     *
     * @param html      markup to parse
     * @param p1        rule selecting the list items
     * @param list_text rule applied to each item for the text part (trimmed)
     * @param list_url  rule applied to each item for the URL part
     * @param add_url   base URL used when extracting the URL part
     * @return the entries in document order; an empty list when {@code p1} matches nothing
     */
    public static List<String> parseDomForList(String html, String p1, String list_text, String list_url,
            String add_url) {
        List<String> new_vod_list = new ArrayList<>();
        for (Element item : selectForList(html, p1)) {
            String it = item.outerHtml();
            new_vod_list.add(parseDomForUrl(it, list_text, "").trim() + '$' + parseDomForUrl(it, list_url, add_url));
        }
        return new_vod_list;
    }

    /** Manual smoke test: fetches a page and prints the results of a few sample rules. */
    public static void main(String[] args) {
        XLHttpUtils.Request url = new XLHttpUtils.Request().get().url("https://m.yskanba.com/b-ertu.html");
        String string = url.exec().body().string();
        System.out.println(string);
        String html = string;
        String rule = ".posterPic&&img&&data-original||src";
        String ret = HtmlParser.parseDomForUrl(html, rule, "");
        System.out.println(ret);
        rule = ".tabt3&&span:not(:contains(云播tk))";
        List<String> rets = HtmlParser.parseDomForArray(html, rule);
        System.out.println(rets);
        rule = ".tabt3 span:not(:matches(云播tk))";
        rets = HtmlParser.parseDomForArray(html, rule);
        System.out.println(rets);
    }
}