JavaScript Web Scraping and Link Analysis
Discover powerful JavaScript methods for web scraping and link analysis. This guide provides examples for capturing links, emails, images, stylesheets, anchor links, and more. Learn efficient techniques to extract valuable information from web pages and analyze link structures.
Capture All Links
// Print the resolved URL of every entry in document.links.
for (const link of document.links) {
  console.log(link.href);
}
Capture All Links (Alternative)
// Map document.links to plain URL strings, then print one per line.
// console.log is wrapped in an arrow function because forEach calls its
// callback with (value, index, array); passing console.log directly would
// print the index and the whole array next to every URL.
Array.from(document.links, ({ href }) => href).forEach((href) => console.log(href));
Capture Links Using getElementsByTagName
// Print the href property of every <a> element on the page.
for (const anchor of document.getElementsByTagName('a')) {
  console.log(anchor.href);
}
Capture Links Using getElementsByTagName (Alternative)
// Collect the href of every <a> element first, then print the resulting strings.
[...document.getElementsByTagName('a')]
  .map((anchor) => anchor.href)
  .forEach((href) => console.log(href));
Capture Links Using a for Loop
// Collect the href of every <a> element into an array of strings, then
// print them with a for...of loop. const keeps the loop variable out of
// the global scope (var would leak it).
const urls = Array.from(document.getElementsByTagName('a'), (anchor) => anchor.href);
for (const url of urls) {
  console.log(url);
}
Capture Emails Using Regular Expression
// Pattern for e-mail-like strings. The TLD class is [A-Za-z]: inside a
// character class "|" is a literal pipe, not alternation, so the previous
// [A-Z|a-z] also accepted "|" as part of the TLD.
const regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
// Scan the serialized markup of the whole document, attributes included.
const html = document.documentElement.innerHTML;
let match;
// With the g flag, exec() resumes from regex.lastIndex on every call and
// returns null once no further match exists.
while ((match = regex.exec(html)) !== null) {
  console.log(match[0]);
}
Capture All Images
// Print the src of every entry in document.images.
for (const image of document.images) {
  console.log(image.src);
}
Capture Stylesheets
// Print the href of every stylesheet attached to the document.
// Inline <style> sheets have no URL, so their href is logged as null.
Array.from(document.styleSheets, (sheet) => sheet.href)
  .forEach((href) => console.log(href));
Capture Internal Links
// Print only the links whose hostname equals the current page's hostname.
for (const link of Array.from(document.links)) {
  if (link.hostname !== location.hostname) {
    continue;
  }
  console.log(link.href);
}
Capture External Links
// Print links that point to a different host than the current page.
// Only http(s) links are considered: mailto:, tel: and javascript: links
// have an empty hostname and would otherwise be reported as external.
Array.from(document.links)
  .filter((a) => /^https?:$/.test(a.protocol) && a.hostname !== location.hostname)
  .forEach(({ href }) => console.log(href));
Capture Unique URLs
// A Set keeps one copy of each URL, in first-seen order.
// Declared with const because the binding is never reassigned; the mapping
// callback is passed to Array.from so no intermediate array is built.
const uniqueURLs = new Set(Array.from(document.links, ({ href }) => href));
uniqueURLs.forEach((url) => console.log(url));
Capture PDF Links
// Print links whose path ends in ".pdf".
// The check uses the lower-cased pathname rather than the full href so that
// "report.PDF", "report.pdf?v=2" and "report.pdf#page=3" are all found.
Array.from(document.links)
  .filter((a) => a.pathname.toLowerCase().endsWith('.pdf'))
  .forEach(({ href }) => console.log(href));
Capture Download Links
// Print the href of every <a> element that carries a download attribute.
for (const link of document.querySelectorAll('a[download]')) {
  console.log(link.href);
}
Capture Mailto Links
// Print every link whose href attribute starts with "mailto:".
document.querySelectorAll('a[href^="mailto:"]').forEach((link) => {
  console.log(link.href);
});
Capture Tel Links
// Print every link whose href attribute starts with "tel:".
document.querySelectorAll('a[href^="tel:"]').forEach((link) => {
  console.log(link.href);
});
Capture Links with Specific Text
// Print links whose rendered text contains the substring 'text'
// (case-sensitive); replace 'text' with the phrase you are looking for.
for (const link of Array.from(document.links)) {
  if (link.innerText.includes('text')) {
    console.log(link.href);
  }
}
Capture Anchor Links
// Print every in-page link, i.e. every <a> whose href attribute starts with "#".
for (const link of document.querySelectorAll('a[href^="#"]')) {
  console.log(link.href);
}
Capture Chrome Tabs URLs
// Print the URL of every tab returned by chrome.tabs.query with an empty
// filter object.
// NOTE(review): chrome.tabs is a browser-extension API, so this presumably
// only works from an extension context with access to tab URLs - confirm.
chrome.tabs.query({}, (tabs) => {
  for (const tab of tabs) {
    console.log(tab.url);
  }
});
Capture Iframe Sources
// Print the src of every <iframe> on the page.
for (const frame of document.querySelectorAll('iframe')) {
  console.log(frame.src);
}
Capture All Image URLs in a New Tab
// List the URL of every image on the page as clickable links in a new tab.

// Escape the characters that are significant in HTML so a URL is always
// treated as text / an attribute value and can never inject markup into
// the generated page (e.g. a data: URL containing quotes, or "&" in a query).
const escapeHtml = (text) => text.replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => {
  const safeUrl = escapeHtml(url);
  return `<a href="${safeUrl}" target="_blank">${safeUrl}</a>`;
});

// window.open() returns null when the browser blocks the pop-up.
const newTab = window.open();
if (newTab) {
  newTab.document.write(`<ul>${anchorTags.map((tag) => `<li>${tag}</li>`).join('')}</ul>`);
  // Close the stream opened by write() so the tab stops "loading".
  newTab.document.close();
} else {
  console.error('Unable to open a new tab; the pop-up may have been blocked.');
}
Capture All Images in a New Tab with Fixed-Size Previews
// Show a 50x50 preview of every image on the page in a new tab; each
// preview links to the full-size image.

// Escape the characters that are significant in HTML so a URL is always
// treated as an attribute value and can never inject markup into the
// generated page (e.g. a data: URL containing quotes).
const escapeHtml = (text) => text.replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => {
  const safeUrl = escapeHtml(url);
  return `<a href="${safeUrl}" target="_blank"><img src="${safeUrl}" width="50" height="50"></a>`;
});

// window.open() returns null when the browser blocks the pop-up.
const newTab = window.open();
if (newTab) {
  newTab.document.write(
    `<ul style="list-style-type:none; padding: 0;">${anchorTags.map((tag) => `<li>${tag}</li>`).join('')}</ul>`
  );
  // Close the stream opened by write() so the tab stops "loading".
  newTab.document.close();
} else {
  console.error('Unable to open a new tab; the pop-up may have been blocked.');
}
Capture All Images in a New Tab with Default-Size Previews
// Show every image on the page at its natural size in a new tab; each
// image links to its own URL.

// Escape the characters that are significant in HTML so a URL is always
// treated as an attribute value and can never inject markup into the
// generated page (e.g. a data: URL containing quotes).
const escapeHtml = (text) => text.replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => {
  const safeUrl = escapeHtml(url);
  return `<a href="${safeUrl}" target="_blank"><img src="${safeUrl}" ></a>`;
});

// window.open() returns null when the browser blocks the pop-up.
const newTab = window.open();
if (newTab) {
  newTab.document.write(
    `<ul style="list-style-type:none; padding: 0;">${anchorTags.map((tag) => `<li>${tag}</li>`).join('')}</ul>`
  );
  // Close the stream opened by write() so the tab stops "loading".
  newTab.document.close();
} else {
  console.error('Unable to open a new tab; the pop-up may have been blocked.');
}
Capture Sources of Elements with src Attributes
// Print the raw src attribute (exactly as written in the markup, not
// resolved to an absolute URL) of every <img>, <script> and <iframe>.
// const and for...of keep the loop variables out of the global scope,
// which the previous var declarations leaked into.
const elementsWithSrc = document.querySelectorAll('img, script, iframe');
for (const element of elementsWithSrc) {
  const src = element.getAttribute('src');
  // Skip elements without a src attribute (null) or with an empty one.
  if (src) {
    console.log(src);
  }
}
Web Crawler with Elapsed Time
// Same-origin crawler: starts at the current page, follows every <a href>
// it finds, and reports how many URLs were visited and how long it took.

// URLs that have been processed (successfully or not).
const crawledUrls = new Set();
// Work list, seeded with the current page. The fragment is dropped because
// "page#a" and "page#b" are the same document.
const pendingUrls = [window.location.href.split('#')[0]];

/**
 * Determine the URL that relative links of a fetched page resolve against.
 *
 * @param {Document} doc - Document parsed from the fetched markup.
 * @param {string} pageUrl - URL the markup was fetched from.
 * @returns {string} The page's <base href> resolved against pageUrl, or
 *   pageUrl itself when there is no usable <base>.
 */
function getBaseUrl(doc, pageUrl) {
  const baseElement = doc.querySelector('base[href]');
  if (!baseElement) {
    return pageUrl;
  }
  try {
    return new URL(baseElement.getAttribute('href'), pageUrl).href;
  } catch (e) {
    return pageUrl;
  }
}

/**
 * Turn an href attribute into an absolute, fragment-free URL worth crawling.
 *
 * @param {string} rawHref - Value of the anchor's href attribute.
 * @param {string} baseUrl - URL that relative hrefs are resolved against.
 * @returns {string|null} The normalized URL, or null when the href is
 *   malformed, is not http(s), or points to another origin.
 */
function toCrawlableUrl(rawHref, baseUrl) {
  let target;
  try {
    target = new URL(rawHref, baseUrl);
  } catch (e) {
    return null;
  }
  // mailto:, tel:, javascript: etc. cannot be fetched.
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    return null;
  }
  // Stay on this site: cross-origin pages are normally blocked by CORS and
  // following them would make the crawl unbounded.
  if (target.origin !== window.location.origin) {
    return null;
  }
  target.hash = '';
  return target.href;
}

/**
 * Crawl every same-origin page reachable from the URLs in pendingUrls,
 * one request at a time, then log the number of crawled URLs and the
 * elapsed time in milliseconds. Failures are logged and do not stop the crawl.
 *
 * @returns {Promise<void>} Resolves when the work list is empty.
 */
async function crawl() {
  const startTime = Date.now();
  // Every URL ever queued, so the "already known?" check is a Set lookup
  // instead of a scan over the pending array.
  const seenUrls = new Set(pendingUrls);

  while (pendingUrls.length > 0) {
    const url = pendingUrls.pop();
    if (crawledUrls.has(url)) {
      continue;
    }
    console.log(`Crawling ${url}`);
    try {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      const text = await response.text();
      const doc = new DOMParser().parseFromString(text, 'text/html');
      // A document created by DOMParser takes its URL from the current page,
      // so a.href would resolve relative links against the wrong base.
      // Resolve them against the fetched (post-redirect) URL instead.
      const baseUrl = getBaseUrl(doc, response.url || url);
      for (const a of doc.querySelectorAll('a[href]')) {
        const href = toCrawlableUrl(a.getAttribute('href'), baseUrl);
        if (href !== null && !seenUrls.has(href)) {
          seenUrls.add(href);
          pendingUrls.push(href);
        }
      }
    } catch (e) {
      console.error(`Failed to crawl "${url}": ${e}`);
    }
    crawledUrls.add(url);
  }

  const elapsedTime = Date.now() - startTime;
  console.log('Finished crawling', crawledUrls.size, 'URLs');
  console.log('Elapsed time:', elapsedTime, 'ms');
}
crawl();