https://github.com/puppeteer/puppeteer
Every now and again I need to connect to a website and scrape something so that I don't have to perform a couple of hundred repetitive actions by hand.
Enter Puppeteer
You can basically get Chrome to do whatever you want using Puppeteer and Node.js.
The js code below does the following:
- Connects to a webpage that has a list of videos
- Grabs the URL for each video sub page
- Loops through each sub-page URL, loads the page, and pulls the 720p video download URL out of a dropdown
- Writes the download_link and the name of the video to a file as JSON
const puppeteer = require("puppeteer");
const fs = require("fs");
const url = "https://example.com/path/to/indexpage";

/**
 * Scrapes the index page at `url` for links to video sub pages, visits each
 * sub page to find its 720p download link, and writes the collected
 * `{ href, text, download_link }` records to disk as a JSON array.
 *
 * Side effects: launches a visible Chrome window, saves a screenshot of the
 * index page to `example.png`, and writes the JSON output file.
 */
(async () => {
  const outputPath =
    "/Users/jmcd/sites/get-original-song-videos/downloadUrlsNames";

  // Mirror the page's console output in the Node console. Uses the public
  // ConsoleMessage accessors; falls back to console.log for message types
  // (e.g. "warning") that have no console method of the same name.
  const forwardConsole = message => {
    const type = message.type();
    const method = typeof console[type] === "function" ? type : "log";
    console[method](message.text());
  };

  const browser = await puppeteer.launch({
    headless: false,
    executablePath:
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  });
  const downloadUrls = [];

  try {
    const page = await browser.newPage();
    page.on("console", forwardConsole);
    // goto() waits for the navigation itself; pairing it with a separate
    // waitForNavigation() can race and hang once the page has already loaded.
    await page.goto(url, { waitUntil: "networkidle0" });

    const elements = await page.$$eval(
      "div.synopsisGroup div.synopsis div.syn-body h3 a",
      anchors => anchors.map(a => ({ href: a.href, text: a.innerText }))
    );

    for (const element of elements) {
      const newPage = await browser.newPage();
      try {
        newPage.on("console", forwardConsole);
        await newPage.goto(element.href, { waitUntil: "networkidle0" });

        // This callback runs inside the browser, so it must be self-contained.
        const dd = await newPage.$$eval(
          "div.dropdownBody a.secondaryButton[href]",
          anchors =>
            anchors.map(a => a.href).filter(href => href.includes("r720P"))
        );

        // dd[0] is undefined when the page offers no 720p link; the record is
        // still kept so the missing entry is visible in the output.
        downloadUrls.push({ ...element, download_link: dd[0] });
        console.log(dd[0]);
      } finally {
        // Close the tab even if this sub page failed, so tabs do not pile up.
        await newPage.close();
      }
    }

    await page.screenshot({ path: "example.png" });
  } finally {
    // Always shut Chrome down, including when scraping throws part-way.
    await browser.close();
  }

  console.log(downloadUrls);
  try {
    await fs.promises.writeFile(outputPath, JSON.stringify(downloadUrls));
    console.log("ok");
  } catch (err) {
    console.log("Error :" + err);
  }
})().catch(err => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
You could keep scripting Puppeteer to download the files as well, but if you write the file URLs to disk you can simply use the following command to download the videos:
# The file holds a top-level JSON array; -r prints raw URLs and "// empty" skips entries with no link.
jq -r '.[].download_link // empty' downloadUrlsNames | xargs wget
0 Comments