initial commit
This commit is contained in:
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@@ -0,0 +1 @@
|
|||||||
|
volume/
|
||||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
node_modules
|
||||||
|
volume/browser
|
||||||
|
db
|
||||||
|
downloads
|
||||||
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# Use the official Node.js 18 image as the base.
# NOTE(review): the app launches Chrome through Puppeteer; the base image may
# lack the shared libraries Chrome needs — confirm the browser starts inside
# the container.
FROM node:18

# Create app directory
WORKDIR /usr/src/app

# Copy the manifests first so the dependency layer is cached until they change
COPY package*.json ./

# Install exactly the versions pinned in package-lock.json (reproducible builds)
RUN npm ci

# Copy app source code
COPY . .

# Document the port the HTTP server listens on (server.js uses 3000)
EXPOSE 3000

# Start the app (runs the server and the cron runner via "npm start")
CMD ["npm", "start"]
|
||||||
22
cron-runner.js
Normal file
22
cron-runner.js
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
const cron = require('node-cron');
|
||||||
|
const { exec } = require('child_process');
|
||||||
|
|
||||||
|
// True while a scraper child process started by runScraper() is still running.
let scraperInFlight = false;

// Upper bound for one scraper run. It is shorter than the 10-minute schedule so
// a hung child cannot keep the in-flight flag set forever.
const SCRAPER_TIMEOUT_MS = 9 * 60 * 1000;

/**
 * Runs scrape/scraper.js in a child process and logs its output.
 *
 * Runs never overlap: the scraper uses a single browser profile directory and
 * a single ./downloads folder, so a second concurrent run (for example the
 * start-up run colliding with the first cron tick) would interfere with the
 * first one. A call made while a run is in progress is skipped.
 *
 * @returns {void}
 */
function runScraper() {
  if (scraperInFlight) {
    console.log('Previous scraper run is still in progress; skipping this run.');
    return;
  }

  scraperInFlight = true;
  console.log('Running scraper...');

  exec('node scrape/scraper.js', { timeout: SCRAPER_TIMEOUT_MS }, (error, stdout, stderr) => {
    // The callback fires once the child has exited (or was killed on timeout).
    scraperInFlight = false;

    if (error) {
      console.error(`Scraper error: ${error.message}`);
      // Keep whatever the scraper printed before failing; it helps diagnosis.
      if (stdout) console.log(`Scraper output:\n${stdout}`);
      return;
    }
    if (stderr) console.error(`Scraper stderr: ${stderr}`);
    if (stdout) console.log(`Scraper output:\n${stdout}`);
  });
}
|
||||||
|
|
||||||
|
// Kick off one scrape right away instead of waiting for the first cron tick.
runScraper();

// Cron expression: fire on every 10th minute of every hour.
const SCRAPE_CRON_EXPRESSION = '*/10 * * * *';
cron.schedule(SCRAPE_CRON_EXPRESSION, runScraper);

console.log('Cron scheduler started. Scraper will run every 10 minutes.');
|
||||||
12
docker-compose.yml
Normal file
12
docker-compose.yml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
version: "3.9"

services:
  app:
    build: .
    ports:
      # host:container — server.js listens on 3000
      - "3000:3000"
    environment:
      NODE_ENV: development
      # NOTE(review): scrape/scraper.js reads EMAIL and PASSWORD from the
      # environment; they are not passed to the container here — confirm how
      # they are meant to be supplied.
    volumes:
      # The container-side path must be absolute; "./usr/..." is rejected.
      # WORKDIR in the Dockerfile is /usr/src/app.
      - ./volume:/usr/src/app/volume
    command: npm start
|
||||||
2312
package-lock.json
generated
Normal file
2312
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
22
package.json
Normal file
22
package.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "tablescraper",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"author": "Jakub Žitník",
|
||||||
|
"type": "commonjs",
|
||||||
|
"main": "server.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1",
|
||||||
|
"start": "concurrently \"node server.js\" \"node cron-runner.js\""
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"axios": "^1.9.0",
|
||||||
|
"concurrently": "^9.2.0",
|
||||||
|
"express": "^5.1.0",
|
||||||
|
"node-cron": "^4.2.1",
|
||||||
|
"openai": "^5.0.2",
|
||||||
|
"puppeteer": "^24.10.0",
|
||||||
|
"xlsx": "^0.18.5"
|
||||||
|
}
|
||||||
|
}
|
||||||
119
scrape/parse.js
Normal file
119
scrape/parse.js
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
const XLSX = require('xlsx');
|
||||||
|
const fs = require("fs");
|
||||||
|
|
||||||
|
// Sheet names that describe one school day, e.g. "pondělí 2. 6. 2025".
const SHEET_DATE_REGEX = /^(pondělí|úterý|středa|čtvrtek|pátek)\s+(\d{1,2})\.\s*(\d{1,2})\.\s*(20\d{2})/i;
// A cell whose text contains "<class>/..." marks the row of that class.
const CLASS_CELL_REGEX = /[AEC][0-4][a-c]?\/.*/s;
// The class identifier itself (e.g. "C2a").
const CLASS_PREFIX_REGEX = /[AEC][0-4][a-c]?/;
// A1-style cell address: column letters followed by the row number.
const CELL_ADDRESS_REGEX = /^([A-Z]+)(\d+)$/;
// Columns B..K (0-based indexes 1..10) are exported for every class row.
const FIRST_EXPORTED_COLUMN = 1;
const END_EXPORTED_COLUMN = 11;
const OUTPUT_DIR = 'db';
const OUTPUT_FILE = 'db/current.json';

/**
 * Extracts the calendar date from a sheet name.
 * @param {string} name - Sheet name.
 * @returns {Date|null} Local-midnight Date, or null when the name is not a day sheet.
 */
function sheetNameToDate(name) {
  const match = name.match(SHEET_DATE_REGEX);
  if (!match) return null;

  const day = parseInt(match[2], 10);
  const month = parseInt(match[3], 10) - 1; // JavaScript months are 0-indexed
  const year = parseInt(match[4], 10);
  return new Date(year, month, day);
}

/**
 * Formats a Date as yyyy-mm-dd using its LOCAL calendar fields.
 * toISOString() converts to UTC first, which moves a local-midnight date to the
 * previous day in time zones east of UTC.
 * @param {Date} date
 * @returns {string}
 */
function formatLocalDate(date) {
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${date.getFullYear()}-${month}-${day}`;
}

/**
 * Converts spreadsheet column letters to a 0-based index (A -> 0, Z -> 25, AA -> 26).
 * @param {string} letters - Upper-case column letters.
 * @returns {number}
 */
function columnLettersToIndex(letters) {
  let index = 0;
  for (const letter of letters) {
    index = index * 26 + (letter.charCodeAt(0) - 'A'.charCodeAt(0) + 1);
  }
  return index - 1;
}

/**
 * Builds the { className: [cell texts of columns B..K] } map for one sheet.
 * Cells whose text starts with "úklid" are exported as null.
 * @param {object} sheet - Worksheet object keyed by cell address.
 * @returns {Object<string, Array<string|null|undefined>>}
 */
function extractClassRows(sheet) {
  // Keep only real cells; special keys such as "!ref" are not addresses.
  const cells = [];
  for (const [address, cell] of Object.entries(sheet)) {
    const parts = CELL_ADDRESS_REGEX.exec(address);
    if (parts) {
      cells.push({ cell, column: columnLettersToIndex(parts[1]), row: parts[2] });
    }
  }

  const rowsByClass = {};
  for (const header of cells) {
    const headerText = header.cell.v;
    if (typeof headerText !== 'string' || !CLASS_CELL_REGEX.test(headerText)) continue;

    const className = headerText.match(CLASS_PREFIX_REGEX)[0];
    const slots = [];
    for (const other of cells) {
      // Compare whole row numbers so row 5 does not also pick up rows 15, 25, ...
      if (other === header || other.row !== header.row) continue;

      const text = other.cell.w;
      const isCleaning = typeof text === 'string' && text.startsWith('úklid');
      slots[other.column] = isCleaning ? null : text;
    }
    rowsByClass[className] = slots.slice(FIRST_EXPORTED_COLUMN, END_EXPORTED_COLUMN);
  }
  return rowsByClass;
}

/**
 * Parses the downloaded workbook and writes the upcoming schedule to db/current.json.
 *
 * Only sheets named after a weekday date that is today or later are exported.
 * The written JSON has the shape
 * `{ schedule: [ {className: [...]}, ... ], props: [ {date, priprava}, ... ] }`
 * where `schedule[i]` and `props[i]` describe the same sheet.
 *
 * @param {string} downloadedFilePath - Path of the .xlsx file to read.
 * @returns {void}
 * @throws Propagates errors from reading the workbook or writing the output file.
 */
function parseThisShit(downloadedFilePath) {
  const workbook = XLSX.readFile(downloadedFilePath);

  // Local midnight of the current day, so sheets for today still count as upcoming.
  const now = new Date();
  const today = new Date(now.getFullYear(), now.getMonth(), now.getDate());

  const upcoming = [];
  for (const name of workbook.SheetNames) {
    const date = sheetNameToDate(name);
    if (date !== null && date >= today) {
      upcoming.push({ name, date });
    }
  }

  const data = {
    schedule: upcoming.map(({ name }) => extractClassRows(workbook.Sheets[name])),
    props: upcoming.map(({ name, date }) => ({
      date: formatLocalDate(date), // yyyy-mm-dd string for easy use
      priprava: /priprava/i.test(name),
    })),
  };

  // The db directory is git-ignored, so it may not exist on a fresh checkout.
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(data));
}
|
||||||
|
|
||||||
|
module.exports = parseThisShit;
|
||||||
133
scrape/scraper.js
Normal file
133
scrape/scraper.js
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
const puppeteer = require('puppeteer');
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
const parseThisShit = require('./parse');
|
||||||
|
|
||||||
|
const EMAIL = process.env.EMAIL;
|
||||||
|
const PASSWORD = process.env.PASSWORD;
|
||||||
|
//const SHAREPOINT_URL = 'https://onedrive.live.com/personal/7d8c4d9baeeebde3/_layouts/15/doc2.aspx?resid=2bddf9b7-8613-4ae3-a684-0be6d73d90bf&cid=7d8c4d9baeeebde3&ct=1748937302474&wdOrigin=OFFICECOM-WEB.START.UPLOAD&wdPreviousSessionSrc=HarmonyWeb&wdPreviousSession=ce7df0ab-aade-4df2-9e2e-492e99049666';
|
||||||
|
const SHAREPOINT_URL = 'https://spsejecnacz.sharepoint.com/:x:/s/nastenka/EbA_RcWKRdRNlB8YU1iuWM4BnMetCQlVm8toHuuyW-TPyA?e=uu3iPR&CID=2686cea0-2d06-3304-4519-087fb9e06fd0';
|
||||||
|
const VOLUME_PATH = path.resolve('./volume/browser');
|
||||||
|
|
||||||
|
/**
 * Empties the ./downloads folder by deleting it and creating it again.
 * Failures are logged and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function clearDownloadsFolder() {
  const downloadsDir = './downloads';
  const removeOptions = { recursive: true, force: true };

  try {
    await fs.promises.rm(downloadsDir, removeOptions);
    await fs.promises.mkdir(downloadsDir);
  } catch (failure) {
    console.error('Error:', failure);
  }
}
|
||||||
|
|
||||||
|
/**
 * Resolves after the given number of milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Returns the most recently modified file with the given extension in `dir`.
 * @param {string} dir - Directory to scan.
 * @param {string} [ext='.xlsx'] - File name suffix to look for.
 * @returns {string|null} Full path of the newest file, or null when there is none.
 */
function getNewestFile(dir, ext = '.xlsx') {
  const files = fs.readdirSync(dir)
    .filter((f) => f.endsWith(ext))
    .map((f) => ({
      name: f,
      time: fs.statSync(path.join(dir, f)).mtime.getTime(),
    }))
    .sort((a, b) => b.time - a.time);
  return files.length ? path.join(dir, files[0].name) : null;
}

/**
 * Polls `dir` until a file with the given extension has appeared and its size
 * is non-zero and unchanged between two consecutive polls.
 * @param {string} dir - Download directory.
 * @param {string} [ext='.xlsx'] - File name suffix to wait for.
 * @param {number} [timeout=40000] - Maximum total wait in milliseconds.
 * @param {number} [pollInterval=500] - Delay between polls in milliseconds.
 * @returns {Promise<string>} Path of the downloaded file.
 * @throws {Error} When no stable file shows up before the timeout.
 */
async function waitForDownload(dir, ext = '.xlsx', timeout = 40000, pollInterval = 500) {
  const deadline = Date.now() + timeout;
  let previous = null;

  while (Date.now() < deadline) {
    const newest = getNewestFile(dir, ext);
    if (newest) {
      const { size } = fs.statSync(newest);
      if (previous && previous.file === newest && previous.size === size && size > 0) {
        return newest;
      }
      previous = { file: newest, size };
    }
    await sleep(pollInterval);
  }

  throw new Error(`Timeout waiting for ${ext} file in download folder`);
}

/**
 * Fills in the login form with EMAIL / PASSWORD and confirms the optional
 * follow-up prompt.
 * @param {object} page - Puppeteer page currently showing the login form.
 * @returns {Promise<void>}
 * @throws {Error} When the credentials are not configured.
 */
async function logIn(page) {
  if (!EMAIL || !PASSWORD) {
    throw new Error('EMAIL and PASSWORD environment variables are required to log in');
  }
  console.log('Logging in...');

  await page.waitForSelector('input[type="email"]', { timeout: 10000 });
  await page.type('input[type="email"]', EMAIL, { delay: 50 });
  await page.keyboard.press('Enter');
  await sleep(2000);

  await page.waitForSelector('input[type="password"]', { timeout: 10000 });
  await page.type('input[type="password"]', PASSWORD, { delay: 50 });
  await page.keyboard.press('Enter');

  try {
    await page.waitForSelector('input[type="submit"]', { timeout: 10000 });
    await page.click('input[type="submit"]');
  } catch {
    console.log('No stay signed in prompt.');
  }

  // wait for navigation after login
  // NOTE(review): if the navigation has already finished by now this waits
  // until Puppeteer's navigation timeout — confirm against the real login flow.
  await page.waitForNavigation({ waitUntil: 'networkidle2' });
}

/**
 * Opens File > Create a Copy > Download a Copy inside the embedded editor frame.
 * @param {object} page - Puppeteer page showing the workbook.
 * @returns {Promise<void>}
 */
async function triggerDownload(page) {
  // Wait for iframe containing file options
  const frameHandle = await page.waitForSelector('iframe');
  const frame = await frameHandle.contentFrame();

  await frame.waitForSelector('button[title="File"]', { timeout: 60000 });
  await frame.click('button[title="File"]');
  await sleep(500);

  // Click "Create a Copy"; if the menu did not open, click "File" once more.
  try {
    await frame.waitForSelector('div[role="menuitem"][name="Create a Copy"]', { visible: true });
  } catch {
    await frame.click('button[title="File"]');
    await sleep(500);
  }
  await frame.click('div[role="menuitem"][name="Create a Copy"]');
  await sleep(500);

  // Click "Download a Copy"
  await frame.waitForSelector('div[role="menuitem"][name="Download a Copy"]', { visible: true });
  await frame.click('div[role="menuitem"][name="Download a Copy"]');
}

// Entry point: download the workbook, parse it, and always close the browser.
(async () => {
  // Start from an empty folder so a stale file from a failed run is never parsed.
  await clearDownloadsFolder();
  const downloadPath = path.resolve('./downloads');
  if (!fs.existsSync(downloadPath)) fs.mkdirSync(downloadPath);

  const browser = await puppeteer.launch({
    headless: 'new', // use `headless: false` to watch the browser while debugging
    userDataDir: VOLUME_PATH,
  });

  try {
    const [page] = await browser.pages();

    const client = await page.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: downloadPath,
    });

    await page.goto(SHAREPOINT_URL, { waitUntil: 'networkidle2' });
    await sleep(3000);

    if (page.url().includes('login.')) {
      await logIn(page);
    }

    await triggerDownload(page);

    // Poll for the file instead of sleeping a fixed time and hoping it arrived.
    const downloadedFilePath = await waitForDownload(downloadPath, '.xlsx');
    console.log('Downloaded file:', downloadedFilePath);

    parseThisShit(downloadedFilePath);

    await clearDownloadsFolder();
  } finally {
    // Close the browser on failures too, so no Chrome process is left behind.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  // Non-zero exit status lets the caller (cron-runner.js) see the failure.
  process.exitCode = 1;
});
|
||||||
27
server.js
Normal file
27
server.js
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
const express = require("express");
|
||||||
|
const path = require("path");
|
||||||
|
const app = express();
|
||||||
|
const fs = require("fs/promises");
|
||||||
|
|
||||||
|
const PORT = 3000;
|
||||||
|
|
||||||
|
// GET / — serve the JSON file stored at db/current.json next to this script.
app.get('/', (_req, res) => {
  const schedulePath = path.join(__dirname, "db", "current.json");
  res.sendFile(schedulePath);
});
|
||||||
|
|
||||||
|
// GET /status — report the state stored in volume/customState.json.
// Responds with {working: true} when the stored flag is truthy, otherwise with
// the stored flag and message. When the state file cannot be read or is not a
// JSON object, responds with HTTP 503 and a JSON body instead of a generic
// error page.
app.get("/status", async (_, res) => {
  let data;
  try {
    const dataStr = await fs.readFile(path.resolve("./volume/customState.json"), { encoding: "utf8" });
    data = JSON.parse(dataStr);
  } catch (err) {
    console.error(`Failed to read status file: ${err.message}`);
    res.status(503).json({ working: false, message: "Status is unavailable" });
    return;
  }

  // JSON.parse may legally return null or a primitive; treat that as unavailable.
  if (data === null || typeof data !== "object") {
    res.status(503).json({ working: false, message: "Status is unavailable" });
    return;
  }

  if (data.working) {
    res.json({ working: true });
  } else {
    res.json({ working: data.working, message: data.message });
  }
});
|
||||||
|
|
||||||
|
// TODO: Reporting errors
|
||||||
|
|
||||||
|
// Start the HTTP server and log its address once it is listening.
app.listen(PORT, function onListening() {
  console.log(`Server is running at http://localhost:${PORT}`);
});
|
||||||
4
volume/customState.json
Normal file
4
volume/customState.json
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"working": true,
|
||||||
|
"message": ""
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user