// Source: hjdhnx/drpy-node — spider/jstest/ocr_check_sources.js (commit f485f7ea29b2101e9d90f3189a1ffc62bd9d5490)

import Tesseract from 'tesseract.js';
import path from 'path';
import fs from 'fs';
import {fileURLToPath} from 'url';
import axios from 'axios';

// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Sibling "js" directory that is scanned for existing source definitions.
const JS_DIR = path.resolve(__dirname, '..', 'js');
// JSON file, next to this script, that accumulates the validated sources.
const OUTPUT_FILE = path.join(__dirname, 'final_sources.json');

// Image to recognize: the first CLI argument (resolved against the current
// working directory) or, when none is given, the default image beside this script.
const imageArg = process.argv[2];
const IMAGE_PATH = path.resolve(imageArg || path.join(__dirname, '待识别.jpg'));

// 1. OCR recognition
/**
 * Runs Tesseract OCR on the image at IMAGE_PATH using the 'chi_sim' language data.
 *
 * Terminates the process with exit code 1 when the image file does not exist.
 * Any error raised while creating the worker or recognizing the image is logged
 * and turned into an empty result.
 *
 * @returns {Promise<Array<{text: string}>>} One `{text}` entry per line of the
 *     recognized text (split on '\n'), or an empty array when OCR fails.
 */
async function performOCR() {
    console.log(`Starting OCR on ${IMAGE_PATH}...`);

    if (!fs.existsSync(IMAGE_PATH)) {
        console.error(`Error: Image file not found at ${IMAGE_PATH}`);
        process.exit(1);
    }

    let worker;
    try {
        // NOTE(review): the second argument (1) is presumably the OCR engine mode;
        // confirm against the installed tesseract.js version.
        worker = await Tesseract.createWorker('chi_sim', 1, {
            langPath: 'https://cdn.jsdelivr.net/gh/naptha/tessdata@gh-pages/4.0.0',
            logger: m => {
                // Only the recognition phase is reported, as an in-place percentage.
                if (m.status === 'recognizing text') {
                    process.stdout.write(`\rOCR Progress: ${(m.progress * 100).toFixed(0)}%`);
                }
            }
        });

        console.log('\nRecognizing text...');
        const result = await worker.recognize(IMAGE_PATH);
        const {text} = result.data;

        console.log('\nOCR Complete. Raw text length:', text.length);
        return text.split('\n').map(l => ({text: l}));
    } catch (error) {
        console.error('\nOCR Failed:', error);
        return [];
    } finally {
        // Always release the worker, including when recognition throws; a failed
        // shutdown must not discard text that was already recognized.
        if (worker) {
            try {
                await worker.terminate();
            } catch (terminateError) {
                console.warn('Warning: Failed to terminate OCR worker:', terminateError);
            }
        }
    }
}

// 2. Data cleaning and extraction
/**
 * Extracts `{name, url}` candidates from OCR output lines.
 *
 * For every line containing an http(s) URL the name is chosen, in order, from:
 *   1. the remaining text on the same line (when longer than one character),
 *   2. the most recent preceding non-URL line (the "buffer"),
 *   3. the URL's hostname, or 'Unknown' when the URL cannot be parsed.
 *
 * @param {Array<{text: string}>} lines OCR lines; entries without a `text`
 *     value are treated as empty and skipped.
 * @returns {Array<{name: string, url: string}>} Candidates in input order.
 */
function parseLines(lines) {
    const candidates = [];
    let bufferName = '';

    // Matches an http/https URL. The character class includes ':', '=', '%', '#',
    // '~' and '+' so that ports and query strings (e.g. ":8080", "?ac=list") stay
    // part of the URL instead of being cut off and mistaken for a name.
    const urlRegex = /(https?:\/\/[a-zA-Z0-9.\-_~:\/?#&=%+]+)/;
    // Punctuation that cannot meaningfully end a URL; typically sentence
    // punctuation or OCR noise glued to the address.
    const trailingNoise = /[.:?#&=%+~]+$/;

    for (const line of lines) {
        const lineText = String(line?.text ?? '').trim();
        if (!lineText) continue;

        const urlMatch = lineText.match(urlRegex);

        if (urlMatch) {
            const rawUrl = urlMatch[1];
            const url = rawUrl.replace(trailingNoise, '');
            // Any other text on the same line (beyond plain symbols) may be the name.
            // The raw match is removed so stripped noise does not end up in the name.
            let name = lineText.replace(rawUrl, '').trim();

            // Strip common separator characters around the name.
            name = name.replace(/^[.\-_|:：\s]+/, '').replace(/[.\-_|:：\s]+$/, '');

            if (name.length > 1) {
                // The name is on the same line as the URL.
                candidates.push({name, url});
                bufferName = '';
            } else if (bufferName) {
                // Use the previous non-URL line as the name.
                candidates.push({name: bufferName, url});
                bufferName = '';
            } else {
                // No name available at all: fall back to the hostname.
                try {
                    const u = new URL(url);
                    candidates.push({name: u.hostname, url});
                } catch (e) {
                    candidates.push({name: 'Unknown', url});
                }
            }
        } else {
            // A non-URL line is assumed to be a name and buffered for the next URL.
            // Very short lines and lines made only of separator symbols are ignored.
            if (lineText.length > 1 && !/^[.\-_|:：]+$/.test(lineText)) {
                bufferName = lineText;
            }
        }
    }

    return candidates;
}

// 3. Helper functions
/**
 * Reduces a URL (or bare host string) to a comparable host key: the lower-cased
 * hostname without a leading "www.". Scheme, port, path, query and fragment are
 * all dropped.
 *
 * Input without a "scheme://" prefix is parsed as if it were http, so
 * "example.com:8080/x" and "http://example.com/" produce the same key.
 *
 * @param {string} url URL or host string; falsy values yield ''.
 * @returns {string} Normalized host, or '' for falsy input.
 */
function normalizeUrl(url) {
    if (!url) return '';
    const raw = String(url).trim();
    // Without an explicit scheme, "host:port" would be parsed as "scheme:path"
    // and lose its hostname, so a default scheme is supplied first.
    const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(raw);
    try {
        const u = new URL(hasScheme ? raw : `http://${raw}`);
        return u.hostname.toLowerCase().replace(/^www\./, '');
    } catch (e) {
        // Unparseable input: approximate the hostname by hand so the result has
        // the same shape (host only) as the parsed branch.
        return raw
            .toLowerCase()
            .replace(/^https?:\/\//, '')
            .replace(/^www\./, '')
            .split(/[/?#:]/)[0];
    }
}

/**
 * Collects the normalized hosts that are already known, from two places:
 *   1. the first `host: '...'` declaration found in each `.js` file under JS_DIR;
 *   2. the `url` of every entry in the existing OUTPUT_FILE, when that file
 *      holds a JSON array.
 *
 * A missing directory or file is skipped. An unparseable OUTPUT_FILE only
 * produces a warning.
 *
 * @returns {Promise<Set<string>>} Hosts as produced by normalizeUrl.
 */
async function getExistingHosts() {
    const hosts = new Set();

    // 1. Check JS folder
    if (fs.existsSync(JS_DIR)) {
        const hostRegex = /host\s*:\s*['"]([^'"]+)['"]/;
        const files = fs.readdirSync(JS_DIR).filter(f => f.endsWith('.js'));
        for (const file of files) {
            const content = fs.readFileSync(path.join(JS_DIR, file), 'utf-8');
            const hostMatch = content.match(hostRegex);
            if (hostMatch) {
                hosts.add(normalizeUrl(hostMatch[1]));
            }
        }
    }

    // 2. Check existing final_sources.json
    if (fs.existsSync(OUTPUT_FILE)) {
        let data;
        try {
            data = JSON.parse(fs.readFileSync(OUTPUT_FILE, 'utf-8'));
        } catch (e) {
            console.warn('Warning: Could not parse existing final_sources.json');
        }
        if (Array.isArray(data)) {
            for (const item of data) {
                // Skip malformed entries (null, non-objects, non-string urls) one
                // by one, so a single bad record does not hide the remaining hosts.
                if (typeof item?.url === 'string' && item.url) {
                    hosts.add(normalizeUrl(item.url));
                }
            }
        }
    }

    return hosts;
}

/**
 * Probes a URL with a single GET request.
 *
 * The request uses a 5000 ms timeout, follows at most 2 redirects, sends a
 * desktop Chrome User-Agent, and accepts any status code below 400.
 *
 * @param {string} url Address to probe.
 * @returns {Promise<boolean>} true when the request completes with an accepted
 *     status; false on any error (timeout, network failure, rejected status).
 */
async function checkUrl(url) {
    const requestConfig = {
        timeout: 5000,
        maxRedirects: 2,
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        },
        validateStatus: (code) => code < 400
    };

    try {
        await axios.get(url, requestConfig);
    } catch (probeError) {
        // Every failure is reported the same way; the cause is not surfaced.
        return false;
    }
    return true;
}

// 4. Main flow
/**
 * Runs the whole pipeline: OCR the image, extract candidate sources, drop those
 * whose host is already known, probe the rest over HTTP, and write the merged
 * list back to OUTPUT_FILE.
 *
 * Returns early, without touching OUTPUT_FILE, when no candidates are extracted.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // A. Run OCR and extract candidates
    const lines = await performOCR();
    const candidates = parseLines(lines);

    console.log(`Extracted ${candidates.length} potential sources from image.`);

    if (candidates.length === 0) {
        console.log('No sources extracted. Exiting.');
        return;
    }

    // B. Compare against existing sources
    console.log('Scanning existing sources...');
    const existingHosts = await getExistingHosts();
    console.log(`Found ${existingHosts.size} existing hosts (in project + json).`);

    console.log('Comparing and validating candidates...');
    const newValidSources = [];

    // Load existing data to append to. Only a JSON array is usable here; anything
    // else is reported and replaced by an empty list so that push() cannot fail.
    let finalSources = [];
    if (fs.existsSync(OUTPUT_FILE)) {
        try {
            const parsed = JSON.parse(fs.readFileSync(OUTPUT_FILE, 'utf-8'));
            if (Array.isArray(parsed)) {
                finalSources = parsed;
            } else {
                console.warn(`Warning: ${OUTPUT_FILE} does not contain a JSON array; starting from an empty list.`);
            }
        } catch (e) {
            console.warn(`Warning: Could not parse ${OUTPUT_FILE} (${e.message}); starting from an empty list.`);
        }
    }

    for (const cand of candidates) {
        // Strip likely OCR artifacts at the end of the URL (e.g. stray dots).
        let cleanUrl = cand.url.replace(/[.\s]+$/, '');
        // Add a protocol when missing.
        if (!cleanUrl.startsWith('http')) cleanUrl = 'http://' + cleanUrl;

        const normUrl = normalizeUrl(cleanUrl);

        // Skip hosts that already exist. Hosts accepted earlier in this run are
        // added to the same set below, so this also deduplicates within the run.
        if (existingHosts.has(normUrl)) {
            console.log(`[SKIP] ${cand.name} (${cleanUrl}) - Already exists.`);
            continue;
        }

        process.stdout.write(`Checking ${cand.name} (${cleanUrl})... `);
        const isValid = await checkUrl(cleanUrl);
        if (isValid) {
            console.log('VALID');
            const sourceObj = {name: cand.name, url: cleanUrl};
            newValidSources.push(sourceObj);
            finalSources.push(sourceObj); // Add to final list
            existingHosts.add(normUrl); // Prevent duplicates later in this run
        } else {
            console.log('INVALID/TIMEOUT');
        }
    }

    console.log('\n=== NEW VALID SOURCES ADDED ===');
    console.log(JSON.stringify(newValidSources, null, 2));

    // Save to file
    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(finalSources, null, 2));
    console.log(`\nUpdated ${OUTPUT_FILE} with ${newValidSources.length} new sources.`);
    console.log(`Total sources in file: ${finalSources.length}`);
}

// Entry point: report any unexpected failure and set a non-zero exit status
// rather than leaving the promise returned by main() without a rejection handler.
main().catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
});