Automate Chrome with Puppeteer

by James McDonald | Feb 19, 2020 | IT Tips | 0 comments

← glabels-3-batch to /dev/stdout to LPR Printer | What is the last supported version of Firefox for Windows XP →

Every now and again I need to connect to a website and scrape something, so that I don't have to perform a couple of hundred repetitive actions by hand.

Enter Puppeteer

You can basically get Chrome to do whatever you want using Puppeteer and Node.js.

The JavaScript code below does the following:

Connects to a webpage that has a list of videos
Grabs the URL for each video sub page
Loops through each sub-page URL, loads the page, reads the video download URLs from a dropdown, and picks the 720p one
Writes the download_link and the name of the video to a file as JSON

const puppeteer = require("puppeteer");
const fs = require("fs");

const url = "https://example.com/path/to/indexpage";

// Where the scraped results are written, as a JSON array.
const OUTPUT_PATH =
  "/Users/jmcd/sites/get-original-song-videos/downloadUrlsNames";

// Browser-console message types that are forwarded to a Node console method
// of a different or identical name. Puppeteer reports "warning", for which
// `console.warning` does not exist; anything not listed falls back to "log".
const CONSOLE_METHODS = new Map([
  ["log", "log"],
  ["debug", "debug"],
  ["info", "info"],
  ["error", "error"],
  ["warning", "warn"]
]);

/**
 * Mirrors a page's browser-console output in the Node console.
 *
 * Uses the public ConsoleMessage accessors (`type()` / `text()`) instead of
 * the private `_type` / `_text` fields, and never calls a console method that
 * does not exist, so the event handler cannot throw.
 *
 * @param {object} page - Puppeteer page to listen on.
 */
function forwardConsole(page) {
  page.on("console", msg => {
    const method = CONSOLE_METHODS.get(msg.type()) || "log";
    console[method](msg.text());
  });
}

/**
 * Opens a video sub page in a new tab and returns its 720p download URL.
 * The tab is closed even when navigation or evaluation fails.
 *
 * @param {object} browser - Running Puppeteer browser.
 * @param {string} href - URL of the video sub page.
 * @returns {Promise<string|undefined>} First dropdown link whose URL contains
 *   "r720P", or `undefined` when the page offers none.
 */
async function find720pLink(browser, href) {
  const videoPage = await browser.newPage();
  try {
    forwardConsole(videoPage);

    // `goto` waits for network idle itself; pairing it with a separate
    // `waitForNavigation` is redundant and can race with the navigation.
    await videoPage.goto(href, { waitUntil: "networkidle0" });

    // This callback runs inside the browser, so it must be self-contained.
    const matches = await videoPage.$$eval(
      "div.dropdownBody a.secondaryButton[href]",
      anchors =>
        anchors
          .map(anchor => anchor.href)
          .filter(link => link.includes("r720P"))
    );
    return matches[0];
  } finally {
    await videoPage.close();
  }
}

/**
 * Scrapes the index page for video sub pages, resolves the 720p download URL
 * of each one, and writes the list of `{ href, text, download_link }` objects
 * to OUTPUT_PATH as a JSON array.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Runs a visible (non-headless) Chrome from the standard macOS install path.
  const browser = await puppeteer.launch({
    headless: false,
    executablePath:
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  });

  const downloadUrls = [];

  try {
    const page = await browser.newPage();
    forwardConsole(page);

    await page.goto(url, { waitUntil: "networkidle0" });

    const elements = await page.$$eval(
      "div.synopsisGroup div.synopsis div.syn-body h3 a",
      anchors =>
        anchors.map(anchor => ({ href: anchor.href, text: anchor.innerText }))
    );

    // Sub pages are visited one at a time, each in its own short-lived tab.
    for (const element of elements) {
      const downloadLink = await find720pLink(browser, element.href);

      if (downloadLink === undefined) {
        // JSON.stringify drops undefined values, so make the gap visible here.
        console.warn(`No 720p link found on ${element.href}`);
      }

      downloadUrls.push({ ...element, download_link: downloadLink });
      console.log(downloadLink);
    }

    await page.screenshot({ path: "example.png" });
  } finally {
    // Always shut Chrome down, otherwise a failure leaves the process running.
    await browser.close();
  }

  console.log(downloadUrls);

  try {
    await fs.promises.writeFile(OUTPUT_PATH, JSON.stringify(downloadUrls));
    console.log("ok");
  } catch (err) {
    console.log("Error :" + err);
  }
}

// Report failures explicitly instead of leaving an unhandled rejection.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

You could keep scripting Puppeteer to download the files as well, but if you write the file URLs to disk you can then just use the following to download the videos:

# The file holds a top-level JSON array; -r prints bare URLs and "// empty" skips entries without a 720p link.
jq -r '.[].download_link // empty' downloadUrlsNames | xargs wget

Automate Chrome with Puppeteer

0 Comments

Submit a Comment Cancel reply