1
0
Files
jecnarozvrh/scrape/scraper.js
2025-12-20 20:52:08 +01:00

204 lines
6.5 KiB
JavaScript

/*
* Copyright (C) 2025 Jakub Žitník
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
import puppeteer from 'puppeteer';
import path from 'path';
import fs from 'fs';
import parseThisShit from './parse.js';
import 'dotenv/config';
// Credentials typed into the login form further down. Both come from the
// environment (presumably populated from a .env file by the 'dotenv/config'
// import above — confirm against the deployment).
const EMAIL = process.env.EMAIL;
const PASSWORD = process.env.PASSWORD;
// Address the scraper opens first; the SHAREPOINT_URL environment variable
// overrides the built-in default (an empty value also falls back to it).
const SHAREPOINT_URL = process.env.SHAREPOINT_URL || 'https://spsejecnacz.sharepoint.com/:x:/s/nastenka/ESy19K245Y9BouR5ksciMvgBu3Pn_9EaT0fpP8R6MrkEmg';
// Absolute path handed to Puppeteer as `userDataDir`, resolved against the
// current working directory (presumably so the browser profile survives
// between runs — confirm).
const VOLUME_PATH = path.resolve('./volume/browser');
/**
 * Empties `./downloads` (relative to the current working directory) by
 * deleting the directory with everything in it and creating it again.
 *
 * Best-effort: a failure is logged to stderr and never thrown.
 *
 * @returns {Promise<void>}
 */
async function clearDownloadsFolder() {
  const target = './downloads';
  const { rm, mkdir } = fs.promises;

  try {
    // `force` makes the removal a no-op when the directory is already gone.
    await rm(target, { recursive: true, force: true });
    await mkdir(target);
  } catch (failure) {
    console.error('Error:', failure);
  }
}
/**
 * Records a failure: saves a full-page screenshot of `page` into `./errors`
 * (created on demand), trims that directory down to its 10 most recently
 * modified entries, and finally prints `err` to stderr.
 *
 * Anything that goes wrong while capturing or trimming is logged and
 * swallowed, so `err` is always printed and this function never throws.
 *
 * @param {object} page - Object exposing `screenshot({ path, fullPage })`.
 * @param {unknown} err - The error that triggered the report.
 * @returns {Promise<void>}
 */
async function handleError(page, err) {
  const KEEP_NEWEST = 10;

  try {
    const dir = path.resolve('./errors');
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir);
    }

    // ':' and '.' are replaced so the ISO timestamp is a safe file name.
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    const shotPath = path.join(dir, `error-${stamp}.png`);
    await page.screenshot({ path: shotPath, fullPage: true });
    console.error(`❌ Error occurred. Screenshot saved: ${shotPath}`);

    // Collect every entry with its modification time, newest first.
    const entries = [];
    for (const name of fs.readdirSync(dir)) {
      const modified = fs.statSync(path.join(dir, name)).mtime.getTime();
      entries.push({ name, time: modified });
    }
    entries.sort((left, right) => right.time - left.time);

    // Everything past the newest KEEP_NEWEST entries is deleted.
    entries.slice(KEEP_NEWEST).forEach(({ name }) => {
      fs.unlinkSync(path.join(dir, name));
    });
  } catch (screenshotErr) {
    console.error('Failed to take screenshot:', screenshotErr);
  }

  console.error(err);
}
/** Suffix Chromium appends to a download that is still being written. */
const PARTIAL_DOWNLOAD_SUFFIX = '.crdownload';

/**
 * Resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Returns the most recently modified finished download in `dir`.
 * Files that still carry Chromium's partial-download suffix are ignored.
 *
 * @param {string} dir - Directory to inspect.
 * @returns {string|null} Full path of the newest finished file, or null.
 */
function getNewestFile(dir) {
  const finished = fs.readdirSync(dir)
    .filter((name) => !name.endsWith(PARTIAL_DOWNLOAD_SUFFIX))
    .map((name) => ({
      name,
      time: fs.statSync(path.join(dir, name)).mtime.getTime(),
    }))
    .sort((a, b) => b.time - a.time);
  return finished.length ? path.join(dir, finished[0].name) : null;
}

/**
 * Polls `dir` until a finished download shows up.
 *
 * @param {string} dir - Download directory (expected to start out empty).
 * @param {number} [timeout=40000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<string>} Full path of the downloaded file.
 * @throws {Error} When no finished file appears before the timeout.
 */
async function waitForDownload(dir, timeout = 40000) {
  const deadline = Date.now() + timeout;
  for (;;) {
    const newest = getNewestFile(dir);
    if (newest) return newest;
    if (Date.now() > deadline) {
      throw new Error('No XLSX file found in download folder');
    }
    await sleep(500);
  }
}

/*
 * Entry point. Opens the SharePoint workbook in headless Chromium, logs in
 * when redirected to a login page, drives File -> Create a Copy -> Download
 * a Copy inside the embedded editor, then copies the downloaded workbook to
 * db/current.xlsx and hands it to the parser.
 *
 * On any failure a screenshot is saved via handleError (when a page exists),
 * the browser is closed and the process exits with status 1.
 */
(async () => {
  let browser;
  let page;
  try {
    // Start from an empty folder so a file left behind by an earlier failed
    // run can never be mistaken for the fresh download.
    await clearDownloadsFolder();

    browser = await puppeteer.launch({
      headless: 'new',
      userDataDir: VOLUME_PATH,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    const openPages = await browser.pages();
    page = openPages[0] ?? await browser.newPage();

    const downloadPath = path.resolve('./downloads');
    // clearDownloadsFolder only logs its failures, so make sure the folder exists.
    if (!fs.existsSync(downloadPath)) fs.mkdirSync(downloadPath);
    const client = await page.createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: downloadPath,
    });

    await page.goto(SHAREPOINT_URL, { waitUntil: 'networkidle2' });
    await sleep(3000);

    if (page.url().includes('login.')) {
      console.log('Logging in...');
      // Fail with a clear message instead of an obscure error from page.type().
      if (!PASSWORD) {
        throw new Error('PASSWORD environment variable must be set to log in');
      }

      try {
        await page.waitForSelector('input[type="email"]', { timeout: 3000 });
        await page.type('input[type="email"]', EMAIL, { delay: 50 });
        await page.keyboard.press('Enter');
      } catch {
        // No usable e-mail field: fall back to clicking the '.table' element
        // (presumably the account picker tile — confirm against the live page).
        try {
          await page.waitForSelector(".table");
          await page.click('.table');
        } catch {
          console.log('No e-mail field or account tile found, continuing...');
        }
      }
      await sleep(2000);

      try {
        await page.waitForSelector('div[role="button"]', { timeout: 5000 });
        const signInButtons = await page.$$('div[role="button"]');
        for (const btn of signInButtons) {
          const text = await page.evaluate(el => el.innerText, btn);
          if (text && text.includes('with password')) {
            console.log('Clicking "Sign in with password"...');
            await btn.click();
            break;
          }
        }
      } catch {
        console.log('"Sign in with password" button not found, continuing...');
      }

      await page.waitForSelector('input[type="password"]', { timeout: 100000 });
      await page.type('input[type="password"]', PASSWORD, { delay: 50 });
      await page.keyboard.press('Enter');

      try {
        await page.waitForSelector('input[type="submit"]', { timeout: 10000 });
        await page.click('input[type="submit"]');
      } catch {
        console.log('No stay signed in prompt.');
      }
      // NOTE(review): if the redirect already finished before this call (e.g.
      // when no prompt was shown) this wait may time out — verify on the live site.
      await page.waitForNavigation({ waitUntil: 'networkidle2' });
    }

    await sleep(5000);
    const frameHandle = await page.waitForSelector('iframe');
    const frame = await frameHandle.contentFrame();

    await frame.waitForSelector('button[title="File"]', { timeout: 60000 });
    await frame.click('button[title="File"]');
    await sleep(500);
    try {
      await frame.waitForSelector('div[role="menuitem"][name="Create a Copy"]', { visible: true });
    } catch {
      // The menu did not open on the first click; click the File button again.
      await frame.click('button[title="File"]');
      await sleep(500);
    }
    await frame.click('div[role="menuitem"][name="Create a Copy"]');
    await sleep(500);
    await frame.waitForSelector('div[role="menuitem"][name="Download a Copy"]', { visible: true });
    await frame.click('div[role="menuitem"][name="Download a Copy"]');

    // Poll for the finished file instead of sleeping a fixed time and taking
    // whatever is newest, which could be a partial or stale file.
    console.log('Waiting for download to finish...');
    const downloadedFilePath = await waitForDownload(downloadPath);
    console.log('Downloaded file:', downloadedFilePath);

    // The copy below fails if the target directory is missing.
    await fs.promises.mkdir('db', { recursive: true });
    await fs.promises.cp(downloadedFilePath, "db/current.xlsx");
    await parseThisShit(downloadedFilePath);
    await clearDownloadsFolder();
    await browser.close();
  } catch (err) {
    if (page) {
      await handleError(page, err);
    } else {
      console.error('Fatal error before page init:', err);
    }
    if (browser) {
      // A failing close must not prevent the non-zero exit below.
      try {
        await browser.close();
      } catch (closeErr) {
        console.error('Failed to close browser:', closeErr);
      }
    }
    process.exit(1);
  }
})();