initial commit
This commit is contained in:
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@@ -0,0 +1 @@
|
||||
volume/
|
||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
node_modules
|
||||
volume/browser
|
||||
db
|
||||
downloads
|
||||
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
# Use official Node.js image as base
|
||||
FROM node:18
|
||||
|
||||
# Create app directory
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy package.json and package-lock.json (if available)
|
||||
COPY package*.json ./
|
||||
|
||||
# Install dependencies
|
||||
RUN npm install
|
||||
|
||||
# Copy app source code
|
||||
COPY . .
|
||||
|
||||
# Expose the port your app runs on (optional, depends on your app)
|
||||
EXPOSE 3000
|
||||
|
||||
# Start the app
|
||||
CMD ["npm", "start"]
|
||||
22
cron-runner.js
Normal file
22
cron-runner.js
Normal file
@@ -0,0 +1,22 @@
|
||||
const cron = require('node-cron');
|
||||
const { exec } = require('child_process');
|
||||
|
||||
// True while a scraper child process started by runScraper() has not finished yet.
let scraperRunning = false;

/**
 * Starts `node scrape/scraper.js` as a child process and logs its outcome.
 *
 * A run is skipped while the previous one is still in progress: the scraper
 * works with a single browser profile directory and a single downloads
 * folder, so two overlapping runs would interfere with each other.
 *
 * @returns {void}
 */
function runScraper() {
  if (scraperRunning) {
    console.log('Previous scraper run is still in progress, skipping this run.');
    return;
  }

  scraperRunning = true;
  console.log('Running scraper...');

  exec('node scrape/scraper.js', (error, stdout, stderr) => {
    scraperRunning = false;

    if (error) {
      // For a failed command, error.message already contains the child's stderr.
      console.error(`Scraper error: ${error.message}`);
    } else if (stderr) {
      console.error(`Scraper stderr: ${stderr}`);
    }

    // Print stdout even after a failure; it shows how far the scraper got.
    if (stdout) console.log(`Scraper output:\n${stdout}`);
  });
}
|
||||
|
||||
// Cron expression whose minute field is "*/10", i.e. every 10 minutes.
const SCRAPE_SCHEDULE = '*/10 * * * *';

// Do one scrape right at startup instead of waiting for the first scheduled tick.
runScraper();

// Register the recurring job.
cron.schedule(SCRAPE_SCHEDULE, runScraper);

console.log('Cron scheduler started. Scraper will run every 10 minutes.');
|
||||
12
docker-compose.yml
Normal file
12
docker-compose.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
version: "3.9"

services:
  app:
    build: .
    ports:
      - "3000:3000"
    environment:
      NODE_ENV: development
    volumes:
      # The container-side target must be an absolute path. It points at the
      # "volume" directory inside the image's WORKDIR (/usr/src/app).
      - ./volume:/usr/src/app/volume
    command: npm start
|
||||
2312
package-lock.json
generated
Normal file
2312
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
22
package.json
Normal file
22
package.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "tablescraper",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"license": "ISC",
|
||||
"author": "Jakub Žitník",
|
||||
"type": "commonjs",
|
||||
"main": "server.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"start": "concurrently \"node server.js\" \"node cron-runner.js\""
|
||||
},
|
||||
"dependencies": {
|
||||
"axios": "^1.9.0",
|
||||
"concurrently": "^9.2.0",
|
||||
"express": "^5.1.0",
|
||||
"node-cron": "^4.2.1",
|
||||
"openai": "^5.0.2",
|
||||
"puppeteer": "^24.10.0",
|
||||
"xlsx": "^0.18.5"
|
||||
}
|
||||
}
|
||||
119
scrape/parse.js
Normal file
119
scrape/parse.js
Normal file
@@ -0,0 +1,119 @@
|
||||
const XLSX = require('xlsx');
|
||||
const fs = require("fs");
|
||||
|
||||
// Sheet names that are parsed start with a Czech weekday followed by "D. M. YYYY".
const SHEET_NAME_DATE = /^(pondělí|úterý|středa|čtvrtek|pátek)\s+(\d{1,2})\.\s*(\d{1,2})\.\s*(20\d{2})/i;
// A header cell: a class code such as "C1a" followed by a slash.
const CLASS_CELL = /[AEC][0-4][a-c]?\/.*/s;
const CLASS_CODE = /[AEC][0-4][a-c]?/;
// A1-style cell address split into column letters and row number.
const CELL_ADDRESS = /^([A-Z]+)(\d+)$/;
// Output location, relative to the current working directory.
const OUTPUT_DIR = 'db';
const OUTPUT_FILE = 'db/current.json';

/**
 * Reads the date encoded in a sheet name.
 *
 * @param {string} sheetName - Worksheet name.
 * @returns {Date|null} Local-midnight date, or null when the name does not match.
 */
function parseSheetDate(sheetName) {
  const match = sheetName.match(SHEET_NAME_DATE);
  if (!match) return null;

  const day = Number.parseInt(match[2], 10);
  const month = Number.parseInt(match[3], 10) - 1; // Date months are 0-indexed
  const year = Number.parseInt(match[4], 10);
  return new Date(year, month, day);
}

/**
 * Formats a date as yyyy-mm-dd from its local calendar fields.
 * (toISOString() converts to UTC first, which moves a local-midnight date to
 * the previous day in time zones ahead of UTC.)
 *
 * @param {Date} date
 * @returns {string}
 */
function formatLocalDate(date) {
  const pad = (value) => String(value).padStart(2, '0');
  return `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`;
}

/**
 * Converts spreadsheet column letters to a zero-based index (A -> 0, Z -> 25, AA -> 26).
 *
 * @param {string} letters - Upper-case column letters.
 * @returns {number}
 */
function columnIndex(letters) {
  let index = 0;
  for (const letter of letters) {
    index = index * 26 + (letter.charCodeAt(0) - 'A'.charCodeAt(0) + 1);
  }
  return index - 1;
}

/**
 * Collects, for every class header cell in a sheet, the formatted texts of the
 * other cells in the same row.
 *
 * @param {Object} sheet - Worksheet object keyed by cell address.
 * @returns {Object<string, Array>} Class code -> values of columns B..K (at most 10 entries).
 */
function extractClassRows(sheet) {
  const cellsByRow = new Map();
  const headers = [];

  for (const [address, cell] of Object.entries(sheet)) {
    const parts = CELL_ADDRESS.exec(address);
    // Skips non-cell keys such as "!ref".
    if (!parts || cell == null) continue;

    const entry = { address, cell, column: columnIndex(parts[1]), row: parts[2] };
    if (!cellsByRow.has(entry.row)) cellsByRow.set(entry.row, []);
    cellsByRow.get(entry.row).push(entry);

    if (typeof cell.v === 'string' && CLASS_CELL.test(cell.v)) headers.push(entry);
  }

  const rows = {};
  for (const header of headers) {
    const classCode = header.cell.v.match(CLASS_CODE)[0];
    const lessons = [];

    // Only cells of exactly the header's row belong to the class; comparing the
    // full row number keeps row 15 out of row 5.
    for (const { address, cell, column } of cellsByRow.get(header.row)) {
      if (address === header.address) continue;
      const text = cell.w;
      // Entries starting with "úklid" are stored as null.
      const isCleaning = typeof text === 'string' && text.startsWith('úklid');
      lessons[column] = isCleaning ? null : text;
    }

    // Drop column A and keep columns B..K.
    rows[classCode] = lessons.slice(1, 11);
  }
  return rows;
}

/**
 * Parses the downloaded workbook and writes the schedule of all sheets dated
 * today or later to db/current.json.
 *
 * The written JSON has the shape
 * `{ schedule: [{ <classCode>: [...] }], props: [{ date, priprava }] }`,
 * with one `schedule` and one `props` entry per upcoming sheet.
 *
 * @param {string} downloadedFilePath - Path of the .xlsx file to read.
 * @returns {void}
 */
function parseThisShit(downloadedFilePath) {
  const workbook = XLSX.readFile(downloadedFilePath);

  // Today at local midnight, so a sheet dated today still counts as upcoming.
  const now = new Date();
  const today = new Date(now.getFullYear(), now.getMonth(), now.getDate());

  const upcoming = [];
  for (const name of workbook.SheetNames) {
    const date = parseSheetDate(name);
    if (date && date >= today) upcoming.push({ name, date });
  }

  const data = {
    schedule: upcoming.map(({ name }) => extractClassRows(workbook.Sheets[name])),
    props: upcoming.map(({ name, date }) => ({
      date: formatLocalDate(date), // yyyy-mm-dd string for easy use
      priprava: /priprava/i.test(name),
    })),
  };

  // The db directory may not exist yet (it is git-ignored).
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });

  // Write to a temporary file and rename it so a concurrent reader never sees
  // a half-written current.json.
  const tempFile = `${OUTPUT_FILE}.tmp`;
  fs.writeFileSync(tempFile, JSON.stringify(data));
  fs.renameSync(tempFile, OUTPUT_FILE);
}
|
||||
|
||||
module.exports = parseThisShit;
|
||||
133
scrape/scraper.js
Normal file
133
scrape/scraper.js
Normal file
@@ -0,0 +1,133 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const parseThisShit = require('./parse');
|
||||
|
||||
const EMAIL = process.env.EMAIL;
|
||||
const PASSWORD = process.env.PASSWORD;
|
||||
//const SHAREPOINT_URL = 'https://onedrive.live.com/personal/7d8c4d9baeeebde3/_layouts/15/doc2.aspx?resid=2bddf9b7-8613-4ae3-a684-0be6d73d90bf&cid=7d8c4d9baeeebde3&ct=1748937302474&wdOrigin=OFFICECOM-WEB.START.UPLOAD&wdPreviousSessionSrc=HarmonyWeb&wdPreviousSession=ce7df0ab-aade-4df2-9e2e-492e99049666';
|
||||
const SHAREPOINT_URL = 'https://spsejecnacz.sharepoint.com/:x:/s/nastenka/EbA_RcWKRdRNlB8YU1iuWM4BnMetCQlVm8toHuuyW-TPyA?e=uu3iPR&CID=2686cea0-2d06-3304-4519-087fb9e06fd0';
|
||||
const VOLUME_PATH = path.resolve('./volume/browser');
|
||||
|
||||
/**
 * Empties the ./downloads folder by deleting it (with everything inside) and
 * creating it again. Failures are logged and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function clearDownloadsFolder() {
  const downloadsDir = './downloads';
  const { rm, mkdir } = fs.promises;

  try {
    // force: true keeps rm from failing when the folder does not exist.
    await rm(downloadsDir, { recursive: true, force: true });
    await mkdir(downloadsDir);
  } catch (failure) {
    console.error('Error:', failure);
  }
}
|
||||
|
||||
// Folder the browser is told to download into.
const DOWNLOAD_DIR = path.resolve('./downloads');
// How long to wait for the workbook to show up in DOWNLOAD_DIR.
const DOWNLOAD_TIMEOUT_MS = 60000;
const POLL_INTERVAL_MS = 500;

// Selectors inside the document iframe.
const FILE_BUTTON = 'button[title="File"]';
const CREATE_COPY_ITEM = 'div[role="menuitem"][name="Create a Copy"]';
const DOWNLOAD_COPY_ITEM = 'div[role="menuitem"][name="Download a Copy"]';

/** Resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Returns the most recently modified file with the given extension.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [ext='.xlsx'] - Required file name suffix.
 * @returns {string|null} Full path of the newest match, or null if there is none.
 */
function getNewestFile(dir, ext = '.xlsx') {
  const files = fs.readdirSync(dir)
    .filter((name) => name.endsWith(ext))
    .map((name) => ({
      name,
      time: fs.statSync(path.join(dir, name)).mtime.getTime(),
    }))
    .sort((a, b) => b.time - a.time);
  return files.length ? path.join(dir, files[0].name) : null;
}

/**
 * Polls a directory until a file with the given extension appears.
 *
 * Chromium normally keeps an in-progress download under a temporary
 * `.crdownload` name, so a file that already ends with `ext` is treated as
 * finished.
 *
 * @param {string} dir - Directory to watch.
 * @param {string} [ext='.xlsx'] - Required file name suffix.
 * @param {number} [timeout=DOWNLOAD_TIMEOUT_MS] - Maximum wait in milliseconds.
 * @returns {Promise<string>} Path of the downloaded file.
 * @throws {Error} When no matching file appears before the timeout.
 */
async function waitForDownload(dir, ext = '.xlsx', timeout = DOWNLOAD_TIMEOUT_MS) {
  const deadline = Date.now() + timeout;
  for (;;) {
    const found = getNewestFile(dir, ext);
    if (found) return found;
    if (Date.now() >= deadline) {
      throw new Error(`Timed out after ${timeout} ms waiting for a ${ext} file in ${dir}`);
    }
    await sleep(POLL_INTERVAL_MS);
  }
}

/**
 * Fills in the e-mail / password login form the page was redirected to.
 *
 * @param {object} page - Puppeteer page currently showing the login form.
 * @returns {Promise<void>}
 * @throws {Error} When EMAIL or PASSWORD is not configured.
 */
async function logIn(page) {
  if (!EMAIL || !PASSWORD) {
    throw new Error('Login is required but the EMAIL and/or PASSWORD environment variable is not set');
  }

  console.log('Logging in...');

  await page.waitForSelector('input[type="email"]', { timeout: 10000 });
  await page.type('input[type="email"]', EMAIL, { delay: 50 });
  await page.keyboard.press('Enter');
  await sleep(2000);

  await page.waitForSelector('input[type="password"]', { timeout: 10000 });
  await page.type('input[type="password"]', PASSWORD, { delay: 50 });
  await page.keyboard.press('Enter');

  // Confirm the follow-up prompt when one is shown; its absence is not an error.
  try {
    await page.waitForSelector('input[type="submit"]', { timeout: 10000 });
    await page.click('input[type="submit"]');
  } catch {
    console.log('No stay signed in prompt.');
  }

  // wait for navigation after login
  await page.waitForNavigation({ waitUntil: 'networkidle2' });
}

/**
 * Opens the shared workbook, downloads a copy through the File menu, parses
 * it and cleans the downloads folder. The browser is always closed, also on
 * failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      userDataDir: VOLUME_PATH,
    });
    const [page] = await browser.pages();

    // Start from an empty folder so a leftover file from an earlier failed run
    // cannot be mistaken for the new download.
    await clearDownloadsFolder();
    fs.mkdirSync(DOWNLOAD_DIR, { recursive: true });

    const client = await page.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: DOWNLOAD_DIR,
    });

    await page.goto(SHAREPOINT_URL, { waitUntil: 'networkidle2' });
    await sleep(3000);

    if (page.url().includes('login.')) {
      await logIn(page);
    }

    // Wait for iframe containing file options
    const frameHandle = await page.waitForSelector('iframe');
    const frame = await frameHandle.contentFrame();

    await frame.waitForSelector(FILE_BUTTON, { timeout: 60000 });
    await frame.click(FILE_BUTTON);
    await sleep(500);

    // Click "Create a Copy"; if the menu item did not show up, click File once more first.
    try {
      await frame.waitForSelector(CREATE_COPY_ITEM, { visible: true });
    } catch {
      await frame.click(FILE_BUTTON);
      await sleep(500);
    }
    await frame.click(CREATE_COPY_ITEM);
    await sleep(500);

    // Click "Download a Copy"
    await frame.waitForSelector(DOWNLOAD_COPY_ITEM, { visible: true });
    await frame.click(DOWNLOAD_COPY_ITEM);

    // Poll for the file instead of sleeping a fixed time and checking once.
    const downloadedFilePath = await waitForDownload(DOWNLOAD_DIR, '.xlsx');
    console.log('Downloaded file:', downloadedFilePath);

    parseThisShit(downloadedFilePath);

    await clearDownloadsFolder();
  } finally {
    if (browser) await browser.close();
  }
}

main().catch((err) => {
  console.error('Scraper failed:', err);
  // Non-zero exit status lets the parent process detect the failure.
  process.exitCode = 1;
});
|
||||
27
server.js
Normal file
27
server.js
Normal file
@@ -0,0 +1,27 @@
|
||||
const express = require("express");
|
||||
const path = require("path");
|
||||
const app = express();
|
||||
const fs = require("fs/promises");
|
||||
|
||||
const PORT = 3000;
|
||||
|
||||
// State file reported by GET /status (resolved against the working directory).
const STATE_FILE = path.resolve("./volume/customState.json");

// GET / -> the JSON file stored at db/current.json next to this script.
app.get('/', (_, res) => {
  res.sendFile(path.join(__dirname, "db", "current.json"));
});

// GET /status -> { working: true } or { working, message } taken from the state file.
app.get("/status", async (_, res) => {
  let data;
  try {
    const dataStr = await fs.readFile(STATE_FILE, { encoding: "utf8" });
    data = JSON.parse(dataStr);
    if (data === null || typeof data !== "object") {
      throw new Error("State file does not contain a JSON object");
    }
  } catch (err) {
    // Answer in the endpoint's own JSON schema instead of the generic error page.
    console.error("Failed to read status file:", err);
    res.status(500).json({ working: false, message: "Status file is missing or unreadable" });
    return;
  }

  if (data.working) {
    res.json({ working: true });
  } else {
    res.json({ working: data.working, message: data.message });
  }
});

// TODO: Reporting errors

app.listen(PORT, () => {
  console.log(`Server is running at http://localhost:${PORT}`);
});
|
||||
4
volume/customState.json
Normal file
4
volume/customState.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"working": true,
|
||||
"message": ""
|
||||
}
|
||||
Reference in New Issue
Block a user