Automate Chrome with Puppeteer

by James McDonald | Feb 19, 2020 | IT Tips | 0 comments

Every now and again I need to connect to a website and scrape something so that I don't have to perform a couple of hundred repetitive actions by hand.

Enter Puppeteer

You can get Chrome to do practically anything you want using Puppeteer and Node.js.

The js code below does the following:

Connects to a webpage that has a list of videos
Grabs the URL for each video sub page
Loops through each sub page URL, loads the page, and picks the 720p video download URL out of the links in a dropdown
Writes the download_link and the name of the video to a file as JSON

const puppeteer = require("puppeteer");
const fs = require("fs");
 
// Index page that lists the videos; each entry links to a sub page.
const url = "https://example.com/path/to/indexpage";

// Console methods that page messages may be forwarded to as-is.
const FORWARDED_CONSOLE_LEVELS = ["log", "info", "warn", "error", "debug"];

/**
 * Mirror a page's console output onto the Node.js console.
 *
 * Uses the public ConsoleMessage accessors (`type()` / `text()`) rather than
 * the private `_type` / `_text` fields. Puppeteer reports warnings as
 * "warning", which is not a `console` method, so it is mapped to "warn";
 * any other unrecognised type falls back to `console.log` instead of throwing.
 *
 * @param {object} targetPage - Puppeteer page to listen on.
 */
function forwardConsole(targetPage) {
  targetPage.on("console", message => {
    const type = message.type() === "warning" ? "warn" : message.type();
    const level = FORWARDED_CONSOLE_LEVELS.includes(type) ? type : "log";
    console[level](message.text());
  });
}

/**
 * Scrape the index page for video sub pages, visit each one to find its 720p
 * download link, then write the collected records to disk as JSON.
 *
 * Each record is `{ href, text, download_link }`; `download_link` is omitted
 * from the JSON when a sub page has no link containing "r720P".
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    executablePath:
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  });

  const downloadUrls = [];

  // Always close the browser, even if navigation or scraping throws, so a
  // failed run does not leave a Chrome process behind.
  try {
    const page = await browser.newPage();
    forwardConsole(page);

    // goto accepts waitUntil directly; no separate waitForNavigation needed.
    await page.goto(url, { waitUntil: "networkidle0" });

    // This callback runs inside the page, so it can only return plain data.
    const elements = await page.$$eval(
      "div.synopsisGroup div.synopsis div.syn-body h3 a",
      anchors => anchors.map(a => ({ href: a.href, text: a.innerText }))
    );

    // Sub pages are visited one at a time, each in its own short-lived tab.
    for (const element of elements) {
      const newPage = await browser.newPage();
      try {
        forwardConsole(newPage);

        await newPage.goto(element.href, { waitUntil: "networkidle0" });

        // First dropdown link whose URL contains "r720P", or undefined.
        const downloadLink = await newPage.$$eval(
          "div.dropdownBody a.secondaryButton[href]",
          anchors =>
            anchors.map(a => a.toString()).find(href => href.includes("r720P"))
        );

        downloadUrls.push({
          ...element,
          download_link: downloadLink
        });

        console.log(downloadLink);
      } finally {
        await newPage.close();
      }
    }

    await page.screenshot({ path: "example.png" });
  } finally {
    await browser.close();
  }

  console.log(downloadUrls);

  try {
    await fs.promises.writeFile(
      "/Users/jmcd/sites/get-original-song-videos/downloadUrlsNames",
      JSON.stringify(downloadUrls)
    );
    console.log("ok");
  } catch (err) {
    console.log("Error :" + err);
  }
})().catch(err => {
  // Surface scraping failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

You could keep scripting Puppeteer to download the files as well, but if you write the file URLs to disk you can then just use the following to download the videos:

cat downloadUrlsNames | jq -r '.[].download_link' | xargs wget

Automate Chrome with Puppeteer

0 Comments

Submit a Comment Cancel reply