https://github.com/puppeteer/puppeteer
Every now and again I need to connect to a website and scrape something so I don't have to do a couple of hundred repetitive actions by hand.
Enter Puppeteer
You can get Chrome to do pretty much whatever you want using Puppeteer and Node.js.
The js code below does the following:
- Connects to a webpage that has a list of videos
- Grabs the URL for each video sub page
- Loops through each sub page URL, loads the page, pulls the video download URLs from a dropdown and selects the 720p version
- Writes the download_link and the name of the video to a file as JSON
```javascript
const puppeteer = require("puppeteer");
const fs = require("fs");

(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  });

  const downloadUrls = [];
  const url = "https://example.com/videos"; // set this to the listing page you want to scrape
  const page = await browser.newPage();
  // forward the page's console output to this process
  page.on("console", msg => console.log(msg.text()));

  await page.goto(url, { waitUntil: "networkidle0" });

  // Grab the href and title of every video sub page linked from the listing
  const elements = await page.$$eval(
    "div.synopsisGroup div.synopsis div.syn-body h3 a",
    links => links.map(x => ({ href: x.href, text: x.innerText }))
  );

  for (let i = 0; i < elements.length; i++) {
    const newPage = await browser.newPage();
    newPage.on("console", msg => console.log(msg.text()));

    await newPage.goto(elements[i].href, { waitUntil: "networkidle0" });

    // Pull every download link out of the dropdown and keep the 720p one.
    // HTMLAnchorElement.toString() returns the anchor's href.
    const dd = await newPage.$$eval(
      "div.dropdownBody a.secondaryButton[href]",
      links => links
        .map(x => x.toString())
        .filter(v => v.indexOf("r720P") !== -1)
    );

    downloadUrls.push({ ...elements[i], download_link: dd[0] });
    console.log(dd[0]);
    await newPage.close();
  }

  await page.screenshot({ path: "example.png" });
  await browser.close();
  console.log(downloadUrls);

  fs.writeFile(
    "/Users/jmcd/sites/get-original-song-videos/downloadUrlsNames",
    JSON.stringify(downloadUrls),
    err => console.log(err ? "Error :" + err : "ok")
  );
})();
```
You could keep scripting Puppeteer to download the files as well, but if you write the file URLs to disk you can then just use the following to download the videos:
```sh
cat downloadUrlsNames | jq -r '.[].download_link' | xargs wget
```