Automate Chrome with Puppeteer

by | Feb 19, 2020 | IT Tips | 0 comments

https://github.com/puppeteer/puppeteer

Every now and again I need to connect to a website and scrape something, so that I don't have to do a couple of hundred repetitive actions by hand.

Enter Puppeteer

You can basically get Chrome to do whatever you want using Puppeteer and Node.js.

The js code below does the following:

  1. Connects to a webpage that has a list of videos
  2. Grabs the URL for each video sub page
  3. Loops through each sub-page URL, loads the page, pulls the video download URLs from a dropdown, and selects the 720p one
  4. Writes the download_link and the name of the video to a file as JSON

const puppeteer = require("puppeteer");
const fs = require("fs");
 
 
/**
 * Scrapes the 720p download link of every video listed on a page.
 *
 * Steps:
 *   1. Open the listing page and collect the link (href + text) of every
 *      video sub page.
 *   2. Open each sub page in its own tab and pick the first download link
 *      whose URL contains "r720P".
 *   3. Take a screenshot of the listing page ("example.png").
 *   4. Write the collected objects ({ href, text, download_link }) to disk as
 *      a JSON array.
 *
 * The listing URL is taken from a `url` variable if one is defined in an
 * enclosing scope, otherwise from the first command-line argument:
 *   node script.js https://example.com/videos
 */
(async () => {
  // `url` is not defined anywhere in this snippet. Use it when the surrounding
  // file provides it (`typeof` is safe on undeclared names), otherwise fall
  // back to the CLI argument instead of dying with a ReferenceError after
  // Chrome has already been launched.
  const listingUrl = typeof url !== "undefined" ? url : process.argv[2];
  if (!listingUrl) {
    throw new Error(
      "No listing URL given: define `url` or pass it as the first argument"
    );
  }

  /**
   * Forwards a console message emitted inside a page to Node's console.
   * Uses the public ConsoleMessage accessors rather than the private
   * `_type` / `_text` fields. Puppeteer reports "warning" where Node's
   * console has "warn", and some types have no console method at all, so
   * anything unknown falls back to console.log rather than throwing inside
   * the event handler.
   */
  const forwardConsole = msg => {
    const type = msg.type() === "warning" ? "warn" : msg.type();
    const method =
      typeof console[type] === "function" ? console[type] : console.log;
    method.call(console, msg.text());
  };

  const browser = await puppeteer.launch({
    headless: false,
    executablePath:
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  });

  const downloadUrls = [];

  // try/finally so Chrome is shut down even when a navigation or selector fails.
  try {
    const page = await browser.newPage();
    page.on("console", forwardConsole);

    // goto() waits for the navigation itself; no separate waitForNavigation needed.
    await page.goto(listingUrl, { waitUntil: "networkidle0" });

    // Runs in the page: one { href, text } record per video link.
    const elements = await page.$$eval(
      "div.synopsisGroup div.synopsis div.syn-body h3 a",
      anchors => anchors.map(a => ({ href: a.href, text: a.innerText }))
    );

    // Sequential on purpose: one extra tab open at a time.
    for (const element of elements) {
      const newPage = await browser.newPage();
      try {
        newPage.on("console", forwardConsole);

        await newPage.goto(element.href, { waitUntil: "networkidle0" });

        // Runs in the page: URLs of all dropdown links that point at the 720p rendition.
        const dd = await newPage.$$eval(
          "div.dropdownBody a.secondaryButton[href]",
          anchors =>
            anchors.map(a => a.href).filter(href => href.includes("r720P"))
        );

        // dd[0] is undefined when the page offers no 720p link; JSON.stringify
        // then omits download_link for that entry.
        downloadUrls.push({
          ...element,
          download_link: dd[0]
        });

        console.log(dd[0]);
      } finally {
        // Always release the tab, even if this sub page failed.
        await newPage.close();
      }
    }

    await page.screenshot({ path: "example.png" });
  } finally {
    await browser.close();
  }

  console.log(downloadUrls);

  fs.writeFile(
    "/Users/jmcd/sites/get-original-song-videos/downloadUrlsNames",
    JSON.stringify(downloadUrls),
    function(err) {
      console.log(err ? "Error :" + err : "ok");
    }
  );
})().catch(err => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});

You could keep scripting Puppeteer to download the files as well, but if you write the file URLs to disk you can just use the following to download the videos:

# downloadUrlsNames holds a top-level JSON array, so iterate it with .[] (there is no "data" key).
# -r prints raw URLs without JSON quotes; "// empty" skips entries that have no download_link.
jq -r '.[].download_link // empty' downloadUrlsNames | xargs wget

0 Comments

Submit a Comment

Your email address will not be published. Required fields are marked *

This site is protected by reCAPTCHA and the Google Privacy Policy and Terms of Service apply.

The reCAPTCHA verification period has expired. Please reload the page.