// File: jecnarozvrh/scrape/scraper.js
// Last modified: 2025-08-07 00:44:20 +02:00
// 149 lines, 5.1 KiB, JavaScript

const puppeteer = require('puppeteer');
const path = require('path');
const fs = require('fs');
const parseThisShit = require('./parse');
// Credentials typed into the email and password inputs of the login flow;
// both are read from the process environment.
const EMAIL = process.env.EMAIL;
const PASSWORD = process.env.PASSWORD;
// Earlier onedrive.live.com target, currently disabled:
//const SHAREPOINT_URL = 'https://onedrive.live.com/personal/7d8c4d9baeeebde3/_layouts/15/doc2.aspx?resid=2bddf9b7-8613-4ae3-a684-0be6d73d90bf&cid=7d8c4d9baeeebde3&ct=1748937302474&wdOrigin=OFFICECOM-WEB.START.UPLOAD&wdPreviousSessionSrc=HarmonyWeb&wdPreviousSession=ce7df0ab-aade-4df2-9e2e-492e99049666';
// URL of the SharePoint document the scraper navigates to.
const SHAREPOINT_URL = 'https://spsejecnacz.sharepoint.com/:x:/s/nastenka/EbA_RcWKRdRNlB8YU1iuWM4BnMetCQlVm8toHuuyW-TPyA?e=uu3iPR&CID=2686cea0-2d06-3304-4519-087fb9e06fd0';
// Absolute path passed to Puppeteer as `userDataDir`
// (presumably so the browser profile, and with it the login session, survives between runs — confirm).
const VOLUME_PATH = path.resolve('./volume/browser');
/**
 * Resets the `./downloads` directory (relative to the current working
 * directory) by deleting it with all of its contents and creating it again.
 *
 * Best-effort: a failure is reported on stderr and not rethrown, so the
 * returned promise always fulfils.
 *
 * @returns {Promise<void>}
 */
async function clearDownloadsFolder() {
  const { rm, mkdir } = fs.promises;
  const folder = './downloads';

  try {
    // `force: true` makes removal a no-op when the folder does not exist yet.
    await rm(folder, { recursive: true, force: true });
    await mkdir(folder);
  } catch (failure) {
    console.error('Error:', failure);
  }
}
/**
 * Resolves after `ms` milliseconds. Used for the fixed settle delays between
 * UI interactions.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Finds the most recently modified file in `dir` whose name ends with `ext`.
 *
 * @param {string} dir - Directory to scan (not recursive).
 * @param {string} [ext='.xlsx'] - Required file-name suffix.
 * @returns {string|null} `dir` joined with the newest matching file name, or
 *   null when no file matches.
 */
function getNewestFile(dir, ext = '.xlsx') {
  const candidates = fs.readdirSync(dir)
    .filter((name) => name.endsWith(ext))
    .map((name) => ({
      name,
      time: fs.statSync(path.join(dir, name)).mtime.getTime(),
    }))
    .sort((a, b) => b.time - a.time);
  return candidates.length ? path.join(dir, candidates[0].name) : null;
}

/**
 * Polls `dir` until a file ending in `ext` exists and returns its path.
 *
 * NOTE(review): this relies on Chrome writing an in-progress download under a
 * temporary name and renaming it on completion, so that a visible `.xlsx`
 * is a finished file — confirm for the Chrome version in use.
 *
 * @param {string} dir - Download directory to watch.
 * @param {string} [ext='.xlsx'] - Required file-name suffix.
 * @param {number} [timeout=60000] - Maximum time to wait, in milliseconds.
 * @param {number} [pollInterval=500] - Delay between scans, in milliseconds.
 * @returns {Promise<string>} Path of the downloaded file.
 * @throws {Error} When no matching file appears before the timeout.
 */
async function waitForDownload(dir, ext = '.xlsx', timeout = 60000, pollInterval = 500) {
  const deadline = Date.now() + timeout;
  for (;;) {
    const found = getNewestFile(dir, ext);
    if (found) {
      return found;
    }
    if (Date.now() >= deadline) {
      throw new Error(`Timed out after ${timeout} ms waiting for a ${ext} file in ${dir}`);
    }
    await sleep(pollInterval);
  }
}

/**
 * Drives the login form: e-mail, optional "Sign in with password" choice,
 * password, and the optional confirmation prompt that follows.
 *
 * @param {object} page - Puppeteer page currently showing the login form.
 * @returns {Promise<void>}
 * @throws {Error} When EMAIL or PASSWORD is not configured, or when a
 *   required form field does not appear in time.
 */
async function logIn(page) {
  if (!EMAIL || !PASSWORD) {
    // Fail with a clear message instead of an opaque error from page.type(undefined).
    throw new Error('EMAIL and PASSWORD environment variables must be set to log in');
  }

  console.log('Logging in...');
  await page.waitForSelector('input[type="email"]', { timeout: 10000 });
  await page.type('input[type="email"]', EMAIL, { delay: 50 });
  await page.keyboard.press('Enter');
  await sleep(2000);

  // Some accounts are offered several sign-in methods first; pick the
  // password one when it is shown, otherwise carry on.
  try {
    await page.waitForSelector('div[role="button"]', { timeout: 5000 });
    const signInButtons = await page.$$('div[role="button"]');
    for (const btn of signInButtons) {
      const text = await page.evaluate((el) => el.innerText, btn);
      if (text && text.includes('with password')) {
        console.log('Clicking "Sign in with password"...');
        await btn.click();
        break;
      }
    }
  } catch {
    console.log('"Sign in with password" button not found, continuing...');
  }

  await page.waitForSelector('input[type="password"]', { timeout: 100000 });
  await page.type('input[type="password"]', PASSWORD, { delay: 50 });
  await page.keyboard.press('Enter');

  let staySignedInButton = null;
  try {
    staySignedInButton = await page.waitForSelector('input[type="submit"]', { timeout: 10000 });
  } catch {
    console.log('No stay signed in prompt.');
  }

  if (staySignedInButton) {
    // Arm the navigation wait before clicking so a fast redirect cannot
    // complete unnoticed and leave the wait hanging until it times out.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle2' }),
      staySignedInButton.click(),
    ]);
    return;
  }

  // Without a prompt the redirect may already have finished while we were
  // waiting for it, so a timeout here is not an error; the caller's wait for
  // the document iframe is the real readiness check.
  try {
    await page.waitForNavigation({ waitUntil: 'networkidle2' });
  } catch {
    console.log('No further navigation observed after login, continuing...');
  }
}

/**
 * Opens the File menu inside the document iframe and clicks
 * "Create a Copy" followed by "Download a Copy" to start the download.
 *
 * @param {object} page - Puppeteer page showing the document.
 * @returns {Promise<void>}
 * @throws {Error} When the iframe or one of the menu entries cannot be found.
 */
async function requestWorkbookDownload(page) {
  const fileButton = 'button[title="File"]';
  const createCopyItem = 'div[role="menuitem"][name="Create a Copy"]';
  const downloadCopyItem = 'div[role="menuitem"][name="Download a Copy"]';

  const frameHandle = await page.waitForSelector('iframe');
  const frame = await frameHandle.contentFrame();
  if (!frame) {
    throw new Error('Document iframe has no content frame');
  }

  await frame.waitForSelector(fileButton, { timeout: 60000 });
  await frame.click(fileButton);
  await sleep(500);

  try {
    await frame.waitForSelector(createCopyItem, { visible: true });
  } catch {
    // Presumably the first click did not open the menu: click again and,
    // unlike before, wait for the entry so the click below cannot hit a
    // missing element.
    await frame.click(fileButton);
    await sleep(500);
    await frame.waitForSelector(createCopyItem, { visible: true });
  }
  await frame.click(createCopyItem);
  await sleep(500);

  await frame.waitForSelector(downloadCopyItem, { visible: true });
  await frame.click(downloadCopyItem);
}

/**
 * Entry point: launches the browser, logs in when redirected to a login
 * page, downloads the workbook, hands it to the parser and cleans up.
 * The browser is closed on every exit path.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Start from an empty folder so a workbook left behind by a failed run can
  // never be mistaken for this run's download.
  await clearDownloadsFolder();
  const downloadPath = path.resolve('./downloads');
  // clearDownloadsFolder only logs its failures, so make sure the folder exists.
  fs.mkdirSync(downloadPath, { recursive: true });

  const browser = await puppeteer.launch({
    headless: 'new',
    userDataDir: VOLUME_PATH,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const [page] = await browser.pages();
    const client = await page.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath,
    });

    await page.goto(SHAREPOINT_URL, { waitUntil: 'networkidle2' });
    await sleep(3000);
    if (page.url().includes('login.')) {
      await logIn(page);
    }

    await requestWorkbookDownload(page);

    // Poll for the file instead of sleeping a fixed time and checking once.
    const downloadedFilePath = await waitForDownload(downloadPath, '.xlsx');
    console.log('Downloaded file:', downloadedFilePath);

    // Awaiting is a no-op for a synchronous parser; if the parser is async it
    // keeps the file from being deleted while it is still being read.
    await parseThisShit(downloadedFilePath);
    await clearDownloadsFolder();
  } finally {
    await browser.close();
  }
}

main().catch((err) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});