From 4985d574d5e6f2a48fcde3a43beef3161e2d4b4e Mon Sep 17 00:00:00 2001 From: jzitnik-dev Date: Sun, 31 Aug 2025 16:35:22 +0200 Subject: [PATCH] feat: Absences --- package-lock.json | 301 +++++++++++++++++++++++++++++++--- package.json | 2 +- scrape/parse.js | 26 ++- scrape/scraper.js | 2 +- scrape/utils/parseAbsence.js | 44 +++++ scrape/utils/parseTeachers.js | 25 +++ 6 files changed, 374 insertions(+), 26 deletions(-) create mode 100644 scrape/utils/parseAbsence.js create mode 100644 scrape/utils/parseTeachers.js diff --git a/package-lock.json b/package-lock.json index cc17ef7..d5d2071 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,10 +10,10 @@ "license": "ISC", "dependencies": { "axios": "^1.9.0", + "cheerio": "^1.1.2", "concurrently": "^9.2.0", "express": "^5.1.0", "node-cron": "^4.2.1", - "openai": "^5.0.2", "puppeteer": "^24.10.0", "xlsx": "^0.18.5" } @@ -306,6 +306,12 @@ "node": ">=18" } }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, "node_modules/buffer-crc32": { "version": "0.2.13", "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz", @@ -403,6 +409,48 @@ "node": ">=8" } }, + "node_modules/cheerio": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.1.2.tgz", + "integrity": "sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==", + "license": "MIT", + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.2", + "encoding-sniffer": "^0.2.1", + "htmlparser2": "^10.0.0", + "parse5": "^7.3.0", + "parse5-htmlparser2-tree-adapter": "^7.1.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^7.12.0", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=20.18.1" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/chromium-bidi": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-5.1.0.tgz", @@ -571,6 +619,34 @@ "node": ">=0.8" } }, + "node_modules/css-select": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz", + "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz", + "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/data-uri-to-buffer": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz", @@ -635,6 +711,61 @@ "integrity": "sha512-FOFDVMGrAUNp0dDKsAU1TorWJUx2JOU1k9xdgBKKJF3IBh/Uhl2yswG5r3TEAOrCiGY2QRp1e6LVDQrCsTKO4g==", "license": "BSD-3-Clause" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -670,6 +801,19 @@ "node": ">= 0.8" } }, + "node_modules/encoding-sniffer": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.1.tgz", + "integrity": "sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, "node_modules/end-of-stream": { "version": "1.4.4", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", @@ -679,6 +823,18 @@ "once": "^1.4.0" } }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/env-paths": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", @@ -1139,6 +1295,37 @@ "node": ">= 0.4" } }, + "node_modules/htmlparser2": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz", + "integrity": "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.2.1", + "entities": "^6.0.0" + } + }, + "node_modules/htmlparser2/node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/http-errors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", @@ -1408,6 +1595,18 @@ "node": ">=6.0.0" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/object-inspect": { "version": "1.13.4", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", @@ -1441,27 +1640,6 @@ "wrappy": "1" } }, - "node_modules/openai": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/openai/-/openai-5.0.2.tgz", - "integrity": "sha512-NN7LAAImgBmd4RIe6WyRpLmwCbn+HQ1iaXeIG7K9DM3Auy/G2waKFhrDfRgaEeY0UUPnm6nohaCsqcS+zO8+2g==", - "license": "Apache-2.0", - "bin": { - "openai": "bin/cli" - }, - "peerDependencies": { - "ws": "^8.18.0", - "zod": "^3.23.8" - }, - "peerDependenciesMeta": { - "ws": { - "optional": true - }, - "zod": { - "optional": true - } - } - }, "node_modules/pac-proxy-agent": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz", @@ -1524,6 +1702,55 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "license": "MIT", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "license": "MIT", + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5/node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -2145,6 +2372,15 @@ "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==", "license": "MIT" }, + "node_modules/undici": { + "version": "7.15.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.15.0.tgz", + "integrity": "sha512-7oZJCPvvMvTd0OlqWsIxTuItTpJBpU1tcbVl24FMn3xt3+VSunwUasmfPJRE57oNO1KsZ4PgA1xTdAX4hq8NyQ==", + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + }, "node_modules/undici-types": { "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", @@ -2170,6 +2406,27 @@ "node": ">= 0.8" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/wmf": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wmf/-/wmf-1.0.2.tgz", diff --git a/package.json b/package.json index c8c0541..bb5a250 100644 --- a/package.json +++ b/package.json @@ -11,10 +11,10 @@ }, "dependencies": { "axios": "^1.9.0", + "cheerio": "^1.1.2", "concurrently": "^9.2.0", "express": "^5.1.0", "node-cron": "^4.2.1", - "openai": "^5.0.2", "puppeteer": "^24.10.0", "xlsx": "^0.18.5" } diff --git a/scrape/parse.js b/scrape/parse.js index b46b7c0..3a9b46b 100644 --- a/scrape/parse.js +++ b/scrape/parse.js @@ -1,7 +1,9 @@ const XLSX = require('xlsx'); const fs = require("fs"); +const parseAbsence = require('./utils/parseAbsence'); +const parseTeachers = require('./utils/parseTeachers'); -function parseThisShit(downloadedFilePath) { +async function parseThisShit(downloadedFilePath) { const workbook = XLSX.readFile(downloadedFilePath); const sheetNames = workbook.SheetNames; @@ -60,7 +62,7 @@ function parseThisShit(downloadedFilePath) { // For each class let classI = 0; for (const matchingKey of matchingKeys) { - const allKeys = Object.keys(currentSheet).filter(key => key !== matchingKey && key.endsWith(matchingKey.replace(/[a-z]/gi, ''))); + const allKeys = Object.keys(currentSheet).filter(key => key !== matchingKey && key.replace(/[a-z]/gi, '') == matchingKey.replace(/[a-z]/gi, '')); const final2 = []; @@ -86,6 +88,24 @@ function parseThisShit(downloadedFilePath) { classI++; } + // ABSENCE + final[finalIndex]["ABSENCE"] = []; + const absenceKey = Object.keys(currentSheet).find(key => { + const value = (typeof currentSheet[key].v == "string" ? currentSheet[key].v : "").trim().toLowerCase(); + return value == "absence"; + }); + const teacherMap = await parseTeachers(); + const allAbsenceKeys = Object.keys(currentSheet).filter(key => key !== absenceKey && key.replace(/[a-z]/gi, '') == absenceKey.replace(/[a-z]/gi, '')); + for (const absenceKeyCur of allAbsenceKeys) { + const value = currentSheet[absenceKeyCur]["v"].trim(); + if (value.length === 0) { + continue; + } + + const data = parseAbsence(value, teacherMap); + final[finalIndex]["ABSENCE"].push(data); + } + finalIndex++; } @@ -119,3 +139,5 @@ function parseThisShit(downloadedFilePath) { } module.exports = parseThisShit; + +// parseThisShit("downloads/table.xlsx"); diff --git a/scrape/scraper.js b/scrape/scraper.js index c35f4f8..6ba6843 100644 --- a/scrape/scraper.js +++ b/scrape/scraper.js @@ -144,7 +144,7 @@ async function clearDownloadsFolder() { console.log('Waiting for file:', downloadedFilePath); await waitForFile(downloadedFilePath); - parseThisShit(downloadedFilePath); + await parseThisShit(downloadedFilePath); await clearDownloadsFolder(); diff --git a/scrape/utils/parseAbsence.js b/scrape/utils/parseAbsence.js new file mode 100644 index 0000000..43a5f32 --- /dev/null +++ b/scrape/utils/parseAbsence.js @@ -0,0 +1,44 @@ +function parseAbsence(str, teacherMap) { + str = str.trim().replace(/\s+/g, " "); + + const result = { + teacher: null, // String or null + teacherCode: null, // String or null + type: null, // "wholeDay" | "range" | "single" | "invalid" + hours: null // { from: number, to: number } or number + }; + + // Regex patterns (with flexible spaces) + const wholeDayPattern = /^([A-Za-z]+)$/; + const rangePattern = /^(\d+)\s*-\s*(\d+)\s+([A-Za-z]+)$/; + const singleHourPattern = /^(\d+)\s+([A-Za-z]+)$/; + + if (rangePattern.test(str)) { + const [, from, to, teacherCode] = str.match(rangePattern); + result.teacher = teacherMap[teacherCode.toLowerCase()]; + result.teacherCode = teacherCode; + result.type = "range"; + result.hours = { from: parseInt(from), to: parseInt(to) }; + } + else if (singleHourPattern.test(str)) { + const [, hour, teacherCode] = str.match(singleHourPattern); + result.teacher = teacherMap[teacherCode.toLowerCase()]; + result.teacherCode = teacherCode; + result.type = "single"; + result.hours = parseInt(hour); + } + else if (wholeDayPattern.test(str)) { + const [, teacherCode] = str.match(wholeDayPattern); + result.teacher = teacherMap[teacherCode.toLowerCase()]; + result.teacherCode = teacherCode; + result.type = "wholeDay"; + } + else { + result.type = "invalid"; + result.original = str; + } + + return result; +} + +module.exports = parseAbsence; diff --git a/scrape/utils/parseTeachers.js b/scrape/utils/parseTeachers.js new file mode 100644 index 0000000..36f865a --- /dev/null +++ b/scrape/utils/parseTeachers.js @@ -0,0 +1,25 @@ +const { default: axios } = require("axios"); +const cheerio = require("cheerio"); + +async function parseTeachers() { + const url = "https://spsejecna.cz/ucitel"; + const { data } = await axios.get(url); + const $ = cheerio.load(data); + + const map = {}; + + $("main .contentLeftColumn li, main .contentRightColumn li").each((_, el) => { + const link = $(el).find("a"); + const href = link.attr("href"); // e.g. "/ucitel/PA" + const text = link.text().trim(); // e.g. "Ing. Bc. Šárka Páltiková" + + if (href) { + const key = href.split("/").pop().toLowerCase(); // get "pa" + map[key] = text; + } + }); + + return map; +} + +module.exports = parseTeachers;