// cpubro/scrapers/cpubenchmark.js

const puppeteer = require('puppeteer');

// Extract CPU model identifier with enough context to avoid false matches
function extractCPUModel(name) {
  // Intel Core Ultra patterns: Core Ultra 5/7/9 XXXX
  const ultraMatch = name.match(/\b(Core\s+Ultra\s+[579]\s+\w+[A-Z]?)\b/i);
  if (ultraMatch) return ultraMatch[1];

  // Intel patterns: Core i3-XXXX, Core i5-XXXX, etc. (include "Core" for specificity)
  // Handle both "Core i5-14400F" and "Core i5 14400F" (with or without dash)
  const intelMatch = name.match(/\b(Core\s+i[3579])-?\s*(\w+[A-Z]?)\b/i);
  if (intelMatch) return `${intelMatch[1]}-${intelMatch[2]}`;

  // Intel Xeon patterns: Xeon Silver/Gold/Platinum XXXX
  const xeonMatch = name.match(/\b(Xeon\s+(?:Silver|Gold|Platinum|Bronze)?\s*\w+[A-Z]?)\b/i);
  if (xeonMatch) return xeonMatch[1];

  // AMD Ryzen patterns: Ryzen X XXXXX (include series number for specificity)
  const ryzenMatch = name.match(/\b(Ryzen\s+[3579]\s+\w+)/i);
  if (ryzenMatch) return ryzenMatch[1];

  // AMD Threadripper patterns: Threadripper XXXX
  const threadripperMatch = name.match(/\b(Threadripper\s+\w+)/i);
  if (threadripperMatch) return threadripperMatch[1];

  return null;
}

async function fetchBenchmarkData(progressCallback) {
  let browser;
  try {
    progressCallback('Loading CPU benchmark database...');
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    progressCallback('Fetching benchmark scores...');
    await page.goto('https://www.cpubenchmark.net/cpu_list.php', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    progressCallback('Parsing benchmark data...');

    // Extract benchmark data from the table
    const benchmarks = await page.evaluate(() => {
      const data = {};
      const rows = document.querySelectorAll('table tr');

      rows.forEach(row => {
        const cells = row.querySelectorAll('td');
        if (cells.length >= 2) {
          const cpuName = cells[0]?.textContent?.trim();
          const cpuMark = cells[1]?.textContent?.trim();

          if (cpuName && cpuMark) {
            const score = parseInt(cpuMark.replace(/,/g, ''));
            if (!isNaN(score) && score > 0) {
              data[cpuName] = score;
            }
          }
        }
      });

      return data;
    });

    const count = Object.keys(benchmarks).length;
    progressCallback(`Loaded ${count} CPU benchmarks`);

    await browser.close();
    return benchmarks;

  } catch (error) {
    if (browser) {
      await browser.close();
    }
    progressCallback(`Error fetching benchmarks: ${error.message}`);
    throw error;
  }
}

function findBenchmarkScore(cpuName, benchmarkData) {
  // Try exact match first
  if (benchmarkData[cpuName]) {
    return benchmarkData[cpuName];
  }

  // Extract the CPU model identifier
  const model = extractCPUModel(cpuName);
  if (!model) return null;

  // Search for CPUs containing this model identifier
  const modelLower = model.toLowerCase();

  for (const [benchName, score] of Object.entries(benchmarkData)) {
    const benchLower = benchName.toLowerCase();

    // Check if the benchmark name contains the exact model
    // Use word boundaries to avoid partial matches
    const modelRegex = new RegExp(`\\b${modelLower.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'i');
    if (modelRegex.test(benchLower)) {
      return score;
    }
  }

  return null;
}

module.exports = { fetchBenchmarkData, findBenchmarkScore };