The script uses puppeteer webscrapper to download a zip file from the webpage.
This file is downloaded to a folder on the desktop.
Now when i create a child job from that folder, the output job is the downloaded zip file inside a job folder called "zip", since i am using that path in the createChild function.
Is this expected behaviour, since i am not downloading to a temp path? I tried to use tmp similar as in here https://github.com/EnfocusSW/webShop/blob/main/main.ts but then the function waitUntilDownload fails and the download is cancelled.
I can remove the folder with job dismantler and everything is ok, but i was just wondering if there is a way to get the downloaded zip file without the folder as a child job?
Code: Select all
import { time } from "console";
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
const { execSync } = require('child_process');
async function jobArrived(s: Switch, flowElement: FlowElement, job: Job) {
//Get access to incoming switch job
let inputjob = await job.get(AccessLevel.ReadWrite);
//Launch Puppeteer browser
const browser = await puppeteer.launch({
headless:false,
defaultViewport: null,
});
try {
//Create new page
const page = await browser.newPage();
// Read HTML file and extract URL
const htmlContent = fs.readFileSync(inputjob, 'utf8');
const urlMatch = htmlContent.match(/https?:\/\/[^\s"'<>]+/);
const url = urlMatch ? urlMatch[0] : null;
//await job.log(LogLevel.Info, url);
if (!url) {
await job.log(LogLevel.Error, 'No URL found in HTML document. Url is ' + url);
await browser.close();
return;
}
// Set download path to desktop/ZIP folder
const desktopPath = path.join(require('os').homedir(), 'Desktop');
const zipFolder = path.join(desktopPath, 'ZIP');
if (!fs.existsSync(zipFolder)) {
fs.mkdirSync(zipFolder, { recursive: true });
}
// Configure download behavior - allow
await page._client().send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath: zipFolder
});
// Navigate to URL
await page.goto(url);
//wait for button
await page.waitForSelector('body > app-root > div > span > app-header > div > nav > div.download-project', { timeout: 10000 });
// Click the "Download Project" button
await page.click('body > app-root > div > span > app-header > div > nav > div.download-project');
// Wait for download to complete
async function waitUntilDownload(page: any, fileName = '', timeoutMs = 60000) {
return Promise.race([
new Promise((resolve, reject) => {
page._client().on('Page.downloadProgress', (e: any) => {
if (e.state === 'completed') {
resolve(fileName);
} else if (e.state === 'canceled') {
reject(new Error('Download canceled'));
}
});
}),
new Promise((_, reject) => {
setTimeout(() => {
// ensure browser is closed on timeout, then reject
browser.close().catch(() => {}).finally(() => {
reject(new Error(`Download timeout after ${timeoutMs}ms`));
});
}, timeoutMs);
})
]);
}
await waitUntilDownload(page, '', 60000);
const newjob = await job.createChild(zipFolder)
await newjob.sendToData(Connection.Level.Success);
await job.sendToNull();
// Remove the downloaded file
const files = fs.readdirSync(zipFolder);
for (const file of files) {
const filePath = path.join(zipFolder, file);
fs.unlinkSync(filePath);
}
//Close browser
await browser.close();
//await job.log(LogLevel.Info, 'File downloaded to:' + zipFolder);
}
catch (err) {
const errorMessage = err instanceof Error ? err.message : String(err);
await job.log(LogLevel.Error, errorMessage);
await browser.close();
await job.sendToData(Connection.Level.Warning);
}
}