|
| 1 | +<?php |
| 2 | +require_once __DIR__ . '/lib/spider.php'; |
| 3 | + |
/**
 * Scraper for the 74p.net site that exposes photo sets, comics and novels
 * through the category / search / detail / player entry points called on
 * this class.
 *
 * All network access goes through the inherited fetch() method; its exact
 * contract is defined by BaseSpider, which is not visible in this file.
 */
class Spider extends BaseSpider {

    /** Site root without a trailing slash. init() re-assigns the same value. */
    private $baseUrl = "https://www.74p.net";

    /**
     * @return string Display name of this source.
     */
    public function getName() {
        return "74P福利(漫画版)";
    }

    /**
     * Sets the site root used to build every request URL.
     *
     * @param string $extend Unused by this spider.
     */
    public function init($extend = "") {
        $this->baseUrl = "https://www.74p.net";
    }

    /**
     * @param string $url Unused.
     * @return bool Always false.
     */
    public function isVideoFormat($url) {
        return false;
    }

    /**
     * @return bool Always false.
     */
    public function manualVideoCheck() {
        return false;
    }

    /**
     * Default request headers; the Referer points at the site root.
     *
     * @return array Header name => header value.
     */
    private function getHeader() {
        return [
            "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Referer" => $this->baseUrl . '/',
            "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Connection" => "keep-alive"
        ];
    }

    /**
     * Fetches a page with the default headers.
     *
     * @param string $url     Page to request.
     * @param string $referer Optional Referer that replaces the default one.
     * @return mixed Whatever the inherited fetch() returns; callers treat a
     *               falsy value as "no content".
     */
    private function fetchHtml($url, $referer = "") {
        $headers = $this->getHeader();
        if ($referer) $headers['Referer'] = $referer;

        return $this->fetch($url, ['headers' => $headers]);
    }

    /**
     * Converts a URL taken from an HTML attribute into an absolute URL.
     *
     * HTML entities are decoded (attribute values are HTML-escaped, e.g.
     * "&amp;"), protocol-relative URLs get "https:", and root-relative URLs
     * are prefixed with the site root. Anything else is returned unchanged.
     *
     * @param string $url Raw attribute value.
     * @return string Normalised URL, or "" when the input is empty.
     */
    private function absoluteUrl($url) {
        $url = html_entity_decode(trim((string)$url), ENT_QUOTES, 'UTF-8');
        if ($url === '') return '';
        if (strpos($url, '//') === 0) return 'https:' . $url;
        if (strpos($url, '/') === 0) return $this->baseUrl . $url;
        return $url;
    }

    /**
     * Reduces an HTML fragment to trimmed plain text with entities decoded.
     *
     * @param string $html HTML fragment or attribute value.
     * @return string Plain text.
     */
    private function plainText($html) {
        return trim(html_entity_decode(strip_tags((string)$html), ENT_QUOTES, 'UTF-8'));
    }

    /**
     * Returns the static category list.
     *
     * @param mixed $filter Unused.
     * @return array ['class' => list of type_name/type_id pairs, 'filters' => []].
     */
    public function homeContent($filter) {
        $cats = [
            // Photo sets
            ["type_name" => "秀人网", "type_id" => "xiurenwang"],
            ["type_name" => "语画界", "type_id" => "yuhuajie"],
            ["type_name" => "花漾", "type_id" => "huayang"],
            ["type_name" => "星颜社", "type_id" => "xingyanshe"],
            ["type_name" => "嗲囡囡", "type_id" => "feilin"],
            ["type_name" => "爱蜜社", "type_id" => "aimishe"],
            ["type_name" => "波萝社", "type_id" => "boluoshe"],
            ["type_name" => "尤物馆", "type_id" => "youwuguan"],
            ["type_name" => "蜜桃社", "type_id" => "miitao"],
            // Comics
            ["type_name" => "日本漫画", "type_id" => "comic/category/jp"],
            ["type_name" => "韩国漫画", "type_id" => "comic/category/kr"],
            // Novels
            ["type_name" => "都市", "type_id" => "novel/category/Urban"],
            ["type_name" => "乱伦", "type_id" => "novel/category/Incestuous"],
            ["type_name" => "玄幻", "type_id" => "novel/category/Xuanhuan"],
            ["type_name" => "武侠", "type_id" => "novel/category/Wuxia"]
        ];

        return ['class' => $cats, 'filters' => []];
    }

    /**
     * @return array Always an empty list; the home page has no featured items.
     */
    public function homeVideoContent() {
        return ['list' => []];
    }

    /**
     * Lists the posts of one category page.
     *
     * @param string $tid    Category path segment (a type_id from homeContent()).
     * @param int    $pg     1-based page number.
     * @param array  $filter Unused.
     * @param array  $extend Unused.
     * @return array Listing in the shape produced by getPostList().
     */
    public function categoryContent($tid, $pg = 1, $filter = [], $extend = []) {
        $url = "{$this->baseUrl}/{$tid}/page/{$pg}";
        return $this->getPostList($url, $pg);
    }

    /**
     * Downloads a listing page and converts each <li> entry into a list item.
     *
     * @param string $url     Listing page URL.
     * @param int    $pg      Page number being requested.
     * @param string $referer Optional Referer for the request.
     * @return array ['list', 'page', 'pagecount', 'limit', 'total'].
     */
    private function getPostList($url, $pg, $referer = "") {
        $page = max(1, (int)$pg);
        $vlist = [];

        $html = $this->fetchHtml($url, $referer);
        if ($html) {
            // Narrow to the main list container when it can be located;
            // otherwise scan the whole document.
            $listBlock = $html;
            if (preg_match('/(?:id="index_ajax_list"|class="site-main")[^>]*>(.*?)<(?:footer|aside)/s', $html, $match)) {
                $listBlock = $match[1];
            }

            if (preg_match_all('/<li[^>]*>(.*?)<\/li>/s', $listBlock, $items)) {
                foreach ($items[1] as $item) {
                    $entry = $this->parseListItem($item);
                    if ($entry !== null) $vlist[] = $entry;
                }
            }
        }

        // The total page count is unknown: advertise one more page while the
        // current one looks full (15 or more entries).
        $pageCount = (count($vlist) >= 15) ? $page + 1 : $page;
        return ['list' => $vlist, 'page' => $page, 'pagecount' => $pageCount, 'limit' => 20, 'total' => 9999];
    }

    /**
     * Builds one list entry from the inner HTML of an <li> element.
     *
     * @param string $item Inner HTML of the list item.
     * @return array|null Entry array, or null when the item is not a post
     *                    (no link, asset link, or implausible title).
     */
    private function parseListItem($item) {
        if (!preg_match('/href=["\']([^"\']+)["\']/', $item, $hrefMatch)) return null;
        $href = $hrefMatch[1];

        // Skip links to static assets and theme/core files.
        foreach (['.css', '.js', 'templates/', 'wp-includes'] as $marker) {
            if (strpos($href, $marker) !== false) return null;
        }

        // Lazy-loaded covers keep the real URL in data-original.
        $pic = "";
        if (preg_match('/data-original=["\']([^"\']+)["\']/', $item, $imgMatch)) {
            $pic = $imgMatch[1];
        } elseif (preg_match('/src=["\']([^"\']+)["\']/', $item, $imgMatch)) {
            $pic = $imgMatch[1];
        }
        $pic = $pic ? $this->absoluteUrl($pic) : $this->baseUrl . "/static/images/cover.png";

        if (preg_match('/title=["\']([^"\']+)["\']/', $item, $titleMatch)) {
            $name = $this->plainText($titleMatch[1]);
        } else {
            // No title attribute: use the first line of the visible text.
            $lines = explode("\n", trim(strip_tags($item)));
            $name = $this->plainText($lines[0]);
        }

        // Reject names that look like CSS/JS fragments or are longer than
        // 100 characters (counted as characters, not bytes).
        if (strpos($name, '.') === 0 || strpos($name, '{') !== false || mb_strlen($name, 'UTF-8') > 100) {
            return null;
        }

        return [
            'vod_id' => $this->absoluteUrl($href),
            'vod_name' => $name,
            'vod_pic' => $pic,
            'vod_remarks' => '点击查看',
            'style' => ["type" => "rect", "ratio" => 1.33]
        ];
    }

    /**
     * Searches the site for a keyword.
     *
     * The keyword is percent-encoded before it is placed in the URL path.
     * The Referer is the comic section when the keyword contains "漫画" and
     * the novel section otherwise.
     *
     * @param string $key   Search keyword.
     * @param bool   $quick Unused.
     * @param int    $pg    1-based page number.
     * @return array Listing in the shape produced by getPostList().
     */
    public function searchContent($key, $quick = false, $pg = 1) {
        $key = (string)$key;
        $url = $this->baseUrl . '/search/' . rawurlencode($key);
        if ($pg > 1) $url .= "/page/{$pg}";

        $referer = (strpos($key, "漫画") !== false) ? "{$this->baseUrl}/comic" : "{$this->baseUrl}/novel";

        return $this->getPostList($url, $pg, $referer);
    }

    /**
     * Builds the detail record (title, cover, summary, chapter list) for a post.
     *
     * @param array $ids List whose first element is the post URL.
     * @return array ['list' => [record]], or ['list' => []] when the URL is
     *               missing or the page cannot be fetched.
     */
    public function detailContent($ids) {
        if (is_array($ids)) {
            $url = isset($ids[0]) ? $ids[0] : '';
        } else {
            $url = (string)$ids;
        }
        if (!$url) return ['list' => []];

        $html = $this->fetchHtml($url);
        if (!$html) return ['list' => []];

        $vod = [
            'vod_id' => $url,
            'vod_name' => '',
            'vod_pic' => '',
            'type_name' => '漫画',
            'vod_content' => '',
            'vod_play_from' => '74P漫画',
            'vod_play_url' => ''
        ];

        // The heading may span lines and contain nested markup.
        if (preg_match('/<h1[^>]*>(.*?)<\/h1>/s', $html, $h1)) {
            $vod['vod_name'] = $this->plainText($h1[1]);
        }

        if (preg_match('/(?:id="content"|class="entry-content"|class="single-content")[^>]*>(.*?)<(?:div class="related|footer|aside|section)/s', $html, $match)) {
            $contentHtml = $match[1];
            $vod['vod_content'] = mb_substr(trim(strip_tags($contentHtml)), 0, 200);

            if (preg_match('/<img[^>]+src=["\']([^"\']+)["\']/', $contentHtml, $imgMatch)) {
                $vod['vod_pic'] = $this->absoluteUrl($imgMatch[1]);
            }
        }

        // No cover found in the content block: fall back to the first image
        // on the page that does not look like a logo, icon, avatar or GIF.
        if (empty($vod['vod_pic']) && preg_match_all('/<img[^>]+src=["\']([^"\']+)["\']/', $html, $matches)) {
            foreach ($matches[1] as $src) {
                if (preg_match('/(logo|icon|avatar|\.gif)/i', $src)) continue;
                $vod['vod_pic'] = $this->absoluteUrl($src);
                break;
            }
        }

        $playList = [];

        // 1. Look for a chapter list.
        if (preg_match_all('/<a[^>]+href=["\']([^"\']*\/(?:comic|novel)\/chapter\/[^"\']+)["\'][^>]*>(.*?)<\/a>/s', $html, $links, PREG_SET_ORDER)) {
            foreach ($links as $link) {
                // '$' and '#' delimit the play list, so they must not appear
                // inside a chapter label.
                $name = trim(str_replace(['$', '#'], ' ', $this->plainText($link[2])));
                if ($name === '') $name = '第' . (count($playList) + 1) . '章';

                $playList[] = $name . '$' . $this->absoluteUrl($link[1]);
            }
        } else {
            // 2. No chapter index: treat the post itself as a single entry.
            $playList[] = '在线观看' . '$' . $url;
        }

        $vod['vod_play_url'] = implode("#", $playList);
        return ['list' => [$vod]];
    }

    /**
     * Collects every image of a chapter and returns them as one "pics://" URL
     * in which the image URLs are joined with "&&".
     *
     * @param string $flag     Unused.
     * @param string $id       Chapter (or post) URL.
     * @param array  $vipFlags Unused.
     * @return array Player descriptor with keys parse, playUrl, url, header.
     */
    public function playerContent($flag, $id, $vipFlags = []) {
        $images = $this->scrapeAllImages($id);
        $novelData = implode("&&", $images);

        return [
            "parse" => 0,
            "playUrl" => "",
            "url" => "pics://{$novelData}",
            "header" => ""
        ];
    }

    /**
     * Walks a paginated post and gathers its image URLs in page order.
     *
     * Stops after 50 pages, when a URL repeats, when a fetch returns nothing,
     * or when no next page can be determined.
     *
     * @param string $url First page of the post.
     * @return array Unique absolute image URLs.
     */
    private function scrapeAllImages($url) {
        $images = [];   // URL => true; keeps insertion order and makes de-duplication O(1)
        $visited = [];  // URL => true
        $currentUrl = $url;
        $maxPages = 50;

        for ($page = 1; $page <= $maxPages; $page++) {
            if (isset($visited[$currentUrl])) break;
            $visited[$currentUrl] = true;

            $html = $this->fetchHtml($currentUrl);
            if (!$html) break;

            // Prefer the article body; fall back to the whole page.
            $contentHtml = $html;
            if (preg_match('/(?:id="content"|class="entry-content"|class="single-content")[^>]*>(.*?)<(?:div class="related|footer|section)/s', $html, $match)) {
                $contentHtml = $match[1];
            }

            $countBefore = count($images);
            foreach ($this->extractImageUrls($contentHtml) as $src) {
                $images[$src] = true;
            }
            $foundNew = count($images) > $countBefore;

            $nextUrl = $this->findNextPageLink($html);
            if ($nextUrl === null) {
                // Guessing is only worthwhile while pages keep yielding images;
                // otherwise it would request page after page up to the cap.
                if (!$foundNew) break;
                $nextUrl = $this->guessNextPageUrl($currentUrl);
            }
            if ($nextUrl === null) break;

            $currentUrl = $this->absoluteUrl($nextUrl);
        }

        return array_keys($images);
    }

    /**
     * Extracts content image URLs from an HTML fragment.
     *
     * For each <img> tag the lazy-load attributes (data-original, data-src)
     * take precedence over src, matching how list covers are resolved.
     * GIF/SVG files and URLs containing "logo", "avatar" or "icon" are skipped.
     *
     * @param string $html HTML fragment.
     * @return array Absolute image URLs in document order (may repeat).
     */
    private function extractImageUrls($html) {
        $urls = [];
        if (!preg_match_all('/<img\b[^>]*>/i', $html, $tags)) return $urls;

        foreach ($tags[0] as $tag) {
            $src = null;
            foreach (['data-original', 'data-src', 'src'] as $attr) {
                // Leading \s keeps "src" from matching inside "data-src".
                if (preg_match('/\s' . $attr . '=["\']([^"\']+)["\']/i', $tag, $m)) {
                    $src = $m[1];
                    break;
                }
            }
            if ($src === null) continue;

            $lowerSrc = strtolower($src);
            foreach (['.gif', '.svg', 'logo', 'avatar', 'icon'] as $marker) {
                if (strpos($lowerSrc, $marker) !== false) continue 2;
            }

            $urls[] = $this->absoluteUrl($src);
        }

        return $urls;
    }

    /**
     * Finds an explicit "next page" link in a page.
     *
     * @param string $html Full page HTML.
     * @return string|null Raw href of the link, or null when none is present.
     */
    private function findNextPageLink($html) {
        if (preg_match('/<a[^>]+href=["\']([^"\']+)["\'][^>]*>(?:下一页|Next|»)<\/a>/i', $html, $nextMatch)) {
            return $nextMatch[1];
        }
        if (preg_match('/<a[^>]+href=["\']([^"\']+)["\'][^>]*class=["\'][^"\']*next[^"\']*["\']/', $html, $nextMatch)) {
            return $nextMatch[1];
        }
        return null;
    }

    /**
     * Derives the next page URL by incrementing a trailing numeric segment.
     *
     * Only applies to URLs that contain "page" and are not comic chapter
     * pages. The increment is based on the number found in the URL itself.
     *
     * @param string $currentUrl URL of the page just processed.
     * @return string|null Guessed URL, or null when the URL has no numeric tail.
     */
    private function guessNextPageUrl($currentUrl) {
        if (strpos($currentUrl, '/comic/chapter/') !== false || strpos($currentUrl, 'page') === false) {
            return null;
        }

        $trimmed = rtrim($currentUrl, '/');
        $slashPos = strrpos($trimmed, '/');
        if ($slashPos === false) return null;

        $lastPart = substr($trimmed, $slashPos + 1);
        if (!preg_match('/^\d+$/', $lastPart)) return null;

        return substr($trimmed, 0, $slashPos + 1) . ((int)$lastPart + 1);
    }
}
| 302 | + |
// Script entry point: create the spider and hand control to the inherited run().
$spider = new Spider();
$spider->run();
0 commit comments