import Tesseract from 'tesseract.js';
import path from 'path';
import fs from 'fs';
import {fileURLToPath} from 'url';
import axios from 'axios';

// ES modules do not provide __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Folder one level above this script's directory, named "js"; scanned for existing host definitions.
const JS_DIR = path.join(__dirname, '..', 'js');
// JSON file located next to this script; accepted sources are stored here.
const OUTPUT_FILE = path.join(__dirname, 'final_sources.json');

// Image to run OCR on: the first command-line argument (a relative path is
// resolved against the current working directory), otherwise a default image
// sitting next to this script.
const imageArg = process.argv[2];
const IMAGE_PATH = imageArg
    ? path.resolve(imageArg)
    : path.join(__dirname, '待识别.jpg');

// 1. OCR recognition
/**
 * Runs Tesseract OCR on the image at IMAGE_PATH and returns the recognised
 * text split into lines.
 *
 * The worker is created with the `chi_sim` language data, downloaded from the
 * configured `langPath`. Recognition progress is written to stdout.
 *
 * Side effects: terminates the process with exit code 1 when the image file
 * does not exist. Any failure while creating the worker or recognising text is
 * logged and reported as an empty result rather than thrown.
 *
 * @returns {Promise<Array<{text: string}>>} One entry per line of raw OCR
 *     output (blank lines included), or an empty array when OCR failed.
 */
async function performOCR() {
    console.log(`Starting OCR on ${IMAGE_PATH}...`);

    if (!fs.existsSync(IMAGE_PATH)) {
        console.error(`Error: Image file not found at ${IMAGE_PATH}`);
        process.exit(1);
    }

    try {
        const worker = await Tesseract.createWorker('chi_sim', 1, {
            langPath: 'https://cdn.jsdelivr.net/gh/naptha/tessdata@gh-pages/4.0.0',
            logger: m => {
                // Only the recognition phase is reported; other statuses are ignored.
                if (m.status === 'recognizing text') {
                    process.stdout.write(`\rOCR Progress: ${(m.progress * 100).toFixed(0)}%`);
                }
            }
        });

        let text;
        try {
            console.log('\nRecognizing text...');
            const result = await worker.recognize(IMAGE_PATH);
            ({text} = result.data);
        } finally {
            // Always release the worker, even when recognition throws;
            // otherwise it would be left running after a failed run.
            await worker.terminate();
        }

        console.log('\nOCR Complete. Raw text length:', text.length);
        return text.split('\n').map(l => ({text: l}));
    } catch (error) {
        console.error('\nOCR Failed:', error);
        return [];
    }
}

// 2. Data cleaning and extraction
/**
 * Extracts `{name, url}` candidates from OCR lines.
 *
 * For every line containing an http(s) URL the name is chosen, in order, from:
 *   1. the remaining text on the same line (after stripping separator noise),
 *   2. the most recent preceding non-URL line (the "buffer"),
 *   3. the URL's hostname, or 'Unknown' if the URL cannot be parsed.
 *
 * @param {Array<{text: string}>} lines OCR lines; a null/undefined list is
 *     treated as empty and entries without a `text` are skipped.
 * @returns {Array<{name: string, url: string}>} Candidates in line order.
 */
function parseLines(lines) {
    const candidates = [];
    let bufferName = '';

    // Matches a URL starting with http or https. The character class includes
    // ':', '=', '%', '#', '~', '+' and '@' so that ports and key=value query
    // strings stay part of the URL instead of being cut off and read as a name.
    const urlRegex = /(https?:\/\/[a-zA-Z0-9.\-_\/?&=%:#~+@]+)/;

    for (const line of lines ?? []) {
        const lineText = String(line?.text ?? '').trim();
        if (!lineText) continue;

        const urlMatch = lineText.match(urlRegex);

        if (!urlMatch) {
            // Not a URL line: assume it is a name and remember it for the next
            // URL. Lines that are too short or pure separator noise are ignored.
            if (lineText.length > 1 && !/^[.\-_|:：]+$/.test(lineText)) {
                bufferName = lineText;
            }
            continue;
        }

        // A trailing ':' is a separator ("http://a.com: Name"), not part of the URL.
        const url = urlMatch[1].replace(/:+$/, '');

        // Any other text on the same line may be the name.
        let name = lineText.replace(url, '').trim();

        // Strip common separator noise from both ends of the name.
        name = name.replace(/^[.\-_|:：\s]+/, '').replace(/[.\-_|:：\s]+$/, '');

        if (name.length > 1) {
            // Name found on the same line.
            candidates.push({name, url});
            bufferName = '';
        } else if (bufferName) {
            // Fall back to the buffered name from an earlier line.
            candidates.push({name: bufferName, url});
            bufferName = '';
        } else {
            // Neither a same-line name nor a buffer: use the hostname for now.
            try {
                const u = new URL(url);
                candidates.push({name: u.hostname, url});
            } catch (e) {
                candidates.push({name: 'Unknown', url});
            }
        }
    }

    return candidates;
}

// 3. Helper functions
/**
 * Reduces a URL to a comparison key: the lower-cased hostname without scheme,
 * port, path, query or a leading "www.".
 *
 * Input without an explicit "scheme://" prefix (e.g. "www.example.com/path" or
 * "example.com:8080/api") is treated as http so that it yields the same key as
 * its fully-qualified form.
 *
 * @param {string} url URL or bare host to normalise.
 * @returns {string} The host key, or '' for empty input.
 */
function normalizeUrl(url) {
    if (!url) return '';

    const raw = String(url).trim();
    const schemePrefix = /^[a-z][a-z0-9+.\-]*:\/\//i;

    // Without "scheme://", new URL() either throws or misreads "host:port" as
    // "scheme:path" (giving an empty hostname), so assume http in that case.
    const absolute = schemePrefix.test(raw) ? raw : `http://${raw}`;

    try {
        return new URL(absolute).hostname.toLowerCase().replace(/^www\./, '');
    } catch (e) {
        // Still unparseable: best-effort textual extraction of the host part.
        return raw
            .toLowerCase()
            .replace(schemePrefix, '')
            .replace(/[\/?#].*$/, '')
            .replace(/:\d*$/, '')
            .replace(/^www\./, '');
    }
}

/**
 * Gathers the normalised hosts that are already known, from two places:
 * the `.js` files in JS_DIR and the entries of OUTPUT_FILE.
 *
 * @returns {Promise<Set<string>>} Set of host keys produced by normalizeUrl.
 */
async function getExistingHosts() {
    const known = new Set();

    // Source 1: the first `host: '...'` literal found in each .js file of JS_DIR.
    if (fs.existsSync(JS_DIR)) {
        const hostPattern = /host\s*:\s*['"]([^'"]+)['"]/;
        const scripts = fs.readdirSync(JS_DIR).filter(entry => entry.endsWith('.js'));
        scripts.forEach(script => {
            const source = fs.readFileSync(path.join(JS_DIR, script), 'utf-8');
            const found = hostPattern.exec(source);
            if (found) {
                known.add(normalizeUrl(found[1]));
            }
        });
    }

    // Source 2: the `url` of every entry already stored in OUTPUT_FILE.
    // A file that fails to parse only produces a warning.
    if (fs.existsSync(OUTPUT_FILE)) {
        try {
            const saved = JSON.parse(fs.readFileSync(OUTPUT_FILE, 'utf-8'));
            if (Array.isArray(saved)) {
                for (const entry of saved) {
                    if (entry.url) known.add(normalizeUrl(entry.url));
                }
            }
        } catch (e) {
            console.warn('Warning: Could not parse existing final_sources.json');
        }
    }

    return known;
}

/**
 * Probes a URL with a single HTTP GET and reports whether it looks reachable.
 *
 * The request uses a 5 second timeout, follows at most 2 redirects, sends a
 * desktop-browser User-Agent and accepts only status codes below 400.
 *
 * @param {string} url Absolute URL to probe.
 * @returns {Promise<boolean>} true when the request completes successfully,
 *     false on any error (the error itself is discarded).
 */
async function checkUrl(url) {
    const requestOptions = {
        timeout: 5000,
        maxRedirects: 2,
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        },
        validateStatus: (status) => status < 400
    };

    try {
        await axios.get(url, requestOptions);
    } catch (e) {
        // Any failure counts as "not valid"; callers only need the boolean.
        return false;
    }
    return true;
}

// 4. Main flow
/**
 * End-to-end pipeline: OCR the image, extract candidate sources, drop those
 * whose host is already known, probe the rest, and append the reachable ones
 * to OUTPUT_FILE.
 *
 * Side effects: network requests, console output, and OUTPUT_FILE is
 * (re)written whenever at least one candidate was extracted. If an existing
 * OUTPUT_FILE cannot be used (invalid JSON or not an array), its contents are
 * first copied to `<OUTPUT_FILE>.bak` so they are not lost.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // A. Run OCR and extract candidates
    const lines = await performOCR();
    const candidates = parseLines(lines);

    console.log(`Extracted ${candidates.length} potential sources from image.`);

    if (candidates.length === 0) {
        console.log('No sources extracted. Exiting.');
        return;
    }

    // B. Compare against existing sources
    console.log('Scanning existing sources...');
    const existingHosts = await getExistingHosts();
    console.log(`Found ${existingHosts.size} existing hosts (in project + json).`);

    console.log('Comparing and validating candidates...');
    const newValidSources = [];

    // Load the existing list so new entries are appended to it.
    let finalSources = [];
    let existingFileUnusable = false;
    if (fs.existsSync(OUTPUT_FILE)) {
        try {
            const parsed = JSON.parse(fs.readFileSync(OUTPUT_FILE, 'utf-8'));
            if (Array.isArray(parsed)) {
                finalSources = parsed;
            } else {
                // Anything but an array cannot be appended to.
                existingFileUnusable = true;
                console.warn(`Warning: ${OUTPUT_FILE} does not contain a JSON array; starting from an empty list.`);
            }
        } catch (e) {
            existingFileUnusable = true;
            console.warn(`Warning: Could not parse ${OUTPUT_FILE} (${e.message}); starting from an empty list.`);
        }
    }

    for (const cand of candidates) {
        // Strip likely OCR artefacts at the end of the URL (stray dots / whitespace).
        let cleanUrl = cand.url.replace(/[.\s]+$/, '');
        // Add a protocol when missing.
        if (!cleanUrl.startsWith('http')) cleanUrl = 'http://' + cleanUrl;

        const normUrl = normalizeUrl(cleanUrl);

        // Skip hosts that are already known. Hosts accepted earlier in this
        // run are added to the same set below, so this also de-duplicates
        // repeated candidates within one image.
        if (existingHosts.has(normUrl)) {
            console.log(`[SKIP] ${cand.name} (${cleanUrl}) - Already exists.`);
            continue;
        }

        process.stdout.write(`Checking ${cand.name} (${cleanUrl})... `);
        const isValid = await checkUrl(cleanUrl);
        if (isValid) {
            console.log('VALID');
            const sourceObj = {name: cand.name, url: cleanUrl};
            newValidSources.push(sourceObj);
            finalSources.push(sourceObj);
            existingHosts.add(normUrl);
        } else {
            console.log('INVALID/TIMEOUT');
        }
    }

    console.log('\n=== NEW VALID SOURCES ADDED ===');
    console.log(JSON.stringify(newValidSources, null, 2));

    // Keep a copy of an unusable existing file before overwriting it. A failed
    // copy is allowed to throw: better to abort than to destroy the old data.
    if (existingFileUnusable) {
        const backupFile = `${OUTPUT_FILE}.bak`;
        fs.copyFileSync(OUTPUT_FILE, backupFile);
        console.warn(`Previous contents of ${OUTPUT_FILE} preserved at ${backupFile}`);
    }

    // Save to file
    fs.writeFileSync(OUTPUT_FILE, JSON.stringify(finalSources, null, 2));
    console.log(`\nUpdated ${OUTPUT_FILE} with ${newValidSources.length} new sources.`);
    console.log(`Total sources in file: ${finalSources.length}`);
}

// Entry point. The promise is not left floating: any unexpected failure is
// reported and reflected in the process exit code.
main().catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
});