JavaScript Web Scraping and Link Analysis

Discover powerful JavaScript methods for web scraping and link analysis. This guide provides examples for capturing links, emails, images, stylesheets, anchor links, and more. Learn efficient techniques to extract valuable information from web pages and analyze link structures.

Capture All Links

// Log the resolved URL of every entry in document.links.
for (const link of document.links) {
  console.log(link.href);
}

Capture All Links (Alternative)

// Map every link to its URL, then log each one.
// console.log is wrapped in an arrow function because forEach calls its
// callback with (value, index, array); passing console.log directly would
// print the index and the entire array next to every URL.
Array.from(document.links, ({ href }) => href).forEach((href) => console.log(href));

Capture Links Using getElementsByTagName

// Log the href of every <a> element found via getElementsByTagName.
Array.from(document.getElementsByTagName('a')).forEach((anchor) => {
  console.log(anchor.href);
});

Capture Links Using getElementsByTagName (Alternative)

// Collect the href of every <a> element first, then log the collected URLs.
[...document.getElementsByTagName('a')]
  .map((anchor) => anchor.href)
  .forEach((url) => {
    console.log(url);
  });

Capture Links Using a for Loop

// Log the href of every <a> element using an explicit for...of loop.
// The loop variable is a block-scoped const, so nothing leaks into the global
// scope the way the former `var` declarations did.
for (const anchor of Array.from(document.getElementsByTagName('a'))) {
    console.log(anchor.href);
}

Capture Emails Using Regular Expression

// Scan the page's serialized HTML for e-mail addresses and log each match.
// The top-level-domain class is [A-Za-z]: inside a character class "|" is a
// literal pipe, so [A-Z|a-z] would also accept "|" as part of the suffix.
const regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
const html = document.documentElement.innerHTML;
let match;
// The /g flag makes exec() resume after the previous match; it returns null
// once no further match exists, which ends the loop.
while ((match = regex.exec(html)) !== null) {
  console.log(match[0]);
}

Capture All Images

// Log the src of every image in document.images.
for (const { src } of document.images) {
  console.log(src);
}

Capture Stylesheets

// Log the URL of every stylesheet that was loaded from a URL.
// Sheets created by inline <style> elements have a null href, so they are
// skipped instead of being printed as "null".
Array.from(document.styleSheets)
  .filter(({ href }) => href !== null)
  .forEach(({ href }) => console.log(href));

Capture Internal Links

// Log every link whose hostname equals the current page's hostname.
for (const link of document.links) {
  if (link.hostname === location.hostname) {
    console.log(link.href);
  }
}

Capture External Links

// Log links that point to a different host than the current page.
// Only http(s) links are considered: mailto:, tel: and javascript: URLs have
// an empty hostname and would otherwise be misreported as external.
Array.from(document.links)
  .filter(({ protocol, hostname }) => /^https?:$/.test(protocol) && hostname !== location.hostname)
  .forEach(({ href }) => console.log(href));

Capture Unique URLs

// Build a Set of link URLs (duplicates collapse automatically), then log each
// distinct URL once, in first-seen order.
let uniqueURLs = new Set(Array.from(document.links, (link) => link.href));
for (const url of uniqueURLs) {
  console.log(url);
}

Capture PDF Links

// Log links whose path ends in ".pdf".
// The check runs on the lower-cased pathname rather than the full href, so
// "report.PDF", "file.pdf?download=1" and "file.pdf#page=2" are matched too.
Array.from(document.links)
  .filter(({ pathname }) => pathname.toLowerCase().endsWith('.pdf'))
  .forEach(({ href }) => console.log(href));

Capture Download Links

// Log the href of every <a> element that carries a download attribute.
document.querySelectorAll('a[download]').forEach((anchor) => {
  console.log(anchor.href);
});

Capture Mailto Links

// Log every <a> whose href attribute starts with "mailto:".
for (const { href } of document.querySelectorAll('a[href^="mailto:"]')) {
  console.log(href);
}

Capture Tel Links

// Log every <a> whose href attribute starts with "tel:".
[...document.querySelectorAll('a[href^="tel:"]')]
  .map((anchor) => anchor.href)
  .forEach((href) => {
    console.log(href);
  });

Capture Links with Specific Text

// Log every link whose rendered text contains the substring 'text'.
for (const link of document.links) {
  if (link.innerText.includes('text')) {
    console.log(link.href);
  }
}

Capture Anchor Links

// Log in-page anchor links: <a> elements whose href attribute starts with "#".
document.querySelectorAll('a[href^="#"]').forEach((anchor) => {
  console.log(anchor.href);
});

Capture Chrome Tabs URLs

// Query tabs with an empty filter object and log the url of each tab
// handed to the callback.
chrome.tabs.query({}, (tabs) => {
  for (const { url } of tabs) {
    console.log(url);
  }
});

Capture Iframe Sources

// Log the src of every <iframe> in the document.
document.querySelectorAll('iframe').forEach((frame) => {
  console.log(frame.src);
});

Capture All Image URLs to a New Tab

// Open a blank tab listing every image URL on the page as a clickable link.
// - Wrapped in a block so its const names cannot collide with the other image
//   snippets, which reuse the same identifiers.
// - Built from DOM nodes (href/textContent) rather than document.write, so an
//   image URL is never parsed as HTML.
// - window.open() returns null when the popup is blocked; that case is
//   reported instead of throwing a TypeError.
{
  const tab = window.open();
  if (tab === null) {
    console.error('Could not open a new tab (popup blocked?)');
  } else {
    const list = tab.document.createElement('ul');
    for (const { src } of document.images) {
      const link = tab.document.createElement('a');
      link.href = src;
      link.target = '_blank';
      link.textContent = src;

      const item = tab.document.createElement('li');
      item.append(link);
      list.append(item);
    }
    tab.document.body.append(list);
  }
}

Capture All Images to a New Tab with Fixed-Size Previews

// Open a blank tab showing a 50x50 preview of every image on the page; each
// preview links to the full image.
// - Wrapped in a block so its const names cannot collide with the other image
//   snippets, which reuse the same identifiers.
// - Built from DOM nodes rather than document.write, so an image URL is never
//   parsed as HTML.
// - window.open() returns null when the popup is blocked; that case is
//   reported instead of throwing a TypeError.
{
  const tab = window.open();
  if (tab === null) {
    console.error('Could not open a new tab (popup blocked?)');
  } else {
    const list = tab.document.createElement('ul');
    list.style.listStyleType = 'none';
    list.style.padding = '0';
    for (const { src } of document.images) {
      const preview = tab.document.createElement('img');
      preview.src = src;
      preview.width = 50;
      preview.height = 50;

      const link = tab.document.createElement('a');
      link.href = src;
      link.target = '_blank';
      link.append(preview);

      const item = tab.document.createElement('li');
      item.append(link);
      list.append(item);
    }
    tab.document.body.append(list);
  }
}

Capture All Images to a New Tab with Default-Size Previews

// Open a blank tab showing every image on the page at its natural size; each
// image links to its own URL.
// - Wrapped in a block so its const names cannot collide with the other image
//   snippets, which reuse the same identifiers.
// - Built from DOM nodes rather than document.write, so an image URL is never
//   parsed as HTML.
// - window.open() returns null when the popup is blocked; that case is
//   reported instead of throwing a TypeError.
{
  const tab = window.open();
  if (tab === null) {
    console.error('Could not open a new tab (popup blocked?)');
  } else {
    const list = tab.document.createElement('ul');
    list.style.listStyleType = 'none';
    list.style.padding = '0';
    for (const { src } of document.images) {
      const preview = tab.document.createElement('img');
      preview.src = src;

      const link = tab.document.createElement('a');
      link.href = src;
      link.target = '_blank';
      link.append(preview);

      const item = tab.document.createElement('li');
      item.append(link);
      list.append(item);
    }
    tab.document.body.append(list);
  }
}

Capture Sources of Elements with src Attributes

// Log the src attribute of every <img>, <script> and <iframe> that has one.
// getAttribute returns the value exactly as written in the markup, or null
// when the attribute is absent; null and empty values are skipped.
// Block-scoped const bindings replace the former `var` declarations, which
// leaked `i`, `element` and `src` into the global scope.
for (const element of document.querySelectorAll('img, script, iframe')) {
  const src = element.getAttribute('src');
  if (src) {
    console.log(src);
  }
}

Web Crawler with Elapsed Time

// Same-origin crawler: starts at the current page, follows <a href> links
// found in fetched HTML, and reports how many URLs were visited and how long
// the crawl took.
const crawledUrls = new Set();
// Seed with the current page, minus its fragment, so "#section" variants of
// the start page are not fetched a second time.
const pendingUrls = [window.location.href.split('#')[0]];

/**
 * Turn a raw href attribute into an absolute, crawlable URL.
 *
 * @param {string} rawHref - The href attribute exactly as written in the markup.
 * @param {string} baseUrl - URL of the page the link was found on.
 * @returns {string|null} The absolute URL without its fragment, or null when
 *   the link is malformed, not http(s), or on a different origin.
 */
function normalizeLink(rawHref, baseUrl) {
  let resolved;
  try {
    resolved = new URL(rawHref, baseUrl);
  } catch (error) {
    // Malformed href: nothing to crawl.
    return null;
  }
  // mailto:, tel:, javascript: etc. cannot be fetched as pages.
  if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
    return null;
  }
  // Stay on the starting site; without this the crawl is unbounded.
  if (resolved.origin !== window.location.origin) {
    return null;
  }
  // Fragments address a position inside the same document.
  resolved.hash = '';
  return resolved.href;
}

/**
 * Crawl every reachable same-origin page, depth-first (pendingUrls is used as
 * a stack), logging progress, failures, the number of URLs visited and the
 * elapsed time in milliseconds.
 *
 * @returns {Promise<void>} Resolves once pendingUrls is empty.
 */
async function crawl() {
  const startTime = Date.now();
  // Every URL ever queued, so the "already seen?" check is a Set lookup
  // instead of a linear scan of pendingUrls for each link.
  const queuedUrls = new Set(pendingUrls);

  while (pendingUrls.length > 0) {
    const url = pendingUrls.pop();
    if (crawledUrls.has(url)) {
      continue;
    }
    // Marked up front so a failed URL is still counted and never retried.
    crawledUrls.add(url);
    console.log(`Crawling ${url}`);
    try {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      const text = await response.text();
      const doc = new DOMParser().parseFromString(text, 'text/html');
      // A DOMParser document takes the current window's URL as its base, so
      // anchor.href would resolve relative links against the wrong page.
      // Resolve the raw attribute against the URL that was actually fetched
      // (after redirects) instead.
      const baseUrl = response.url || url;
      for (const anchor of doc.querySelectorAll('a[href]')) {
        const link = normalizeLink(anchor.getAttribute('href'), baseUrl);
        if (link !== null && !queuedUrls.has(link)) {
          queuedUrls.add(link);
          pendingUrls.push(link);
        }
      }
    } catch (e) {
      console.error(`Failed to crawl "${url}": ${e}`);
    }
  }

  const elapsedTime = Date.now() - startTime;
  console.log('Finished crawling', crawledUrls.size, 'URLs');
  console.log('Elapsed time:', elapsedTime, 'ms');
}

// Per-URL failures are handled inside crawl(); this catch only reports an
// unexpected error so the promise is never left unhandled.
crawl().catch((error) => console.error('Crawl aborted:', error));