curl/curl_basic.js - Documentation

curl/curl_basic.js
//Requires puppeteer, puppeteer-extra-plugin-stealth and canvas. JSDOM, sync_request, puppeteer and fs are used below without a local require.
{
	/**
	 * Root namespace that every CURL/Puppeteer utility function in this file is attached to.
	 *
	 * @namespace CURL
	 */
	if (!global.CURL) {
		//Only create the namespace when it is missing, so an existing object is never replaced
		global.CURL = {};
	}
	
	/**
	 * Places a screenshot onto a white canvas of the given size, leaving the requested margins
	 * around it. The screenshot is scaled to fill the area that remains inside the margins.
	 * @alias CURL.addMarginsToScreenshot
	 *
	 * @param {Object} arg0_screenshot_buffer - The current screenshot buffer.
	 * @param {Object} arg1_options
	 *  @param {number} arg1_options.height - Height of the output canvas.
	 *  @param {number} arg1_options.width - Width of the output canvas.
	 *  @param {number} [arg1_options.margin_bottom=0]
	 *  @param {number} [arg1_options.margin_left=0]
	 *  @param {number} [arg1_options.margin_right=0]
	 *  @param {number} [arg1_options.margin_top=0]
	 *
	 * @returns {Promise<Buffer>} - The buffer produced by canvas.toBuffer().
	 */
	CURL.addMarginsToScreenshot = async function (arg0_screenshot_buffer, arg1_options) {
		//Convert from parameters
		let screenshot_buffer = arg0_screenshot_buffer;
		let options = (arg1_options) ? arg1_options : {};
		
		//Initialise margins; an omitted margin must count as 0, otherwise drawImage() receives undefined/NaN
		let margin_bottom = options.margin_bottom || 0;
		let margin_left = options.margin_left || 0;
		let margin_right = options.margin_right || 0;
		let margin_top = options.margin_top || 0;
		
		//Declare local instance variables
		let { createCanvas, loadImage } = require("canvas");
		
		//Create a canvas with specified dimensions
		let canvas = createCanvas(options.width, options.height);
		let ctx = canvas.getContext("2d");
		
		//Fill the canvas with a white background
		ctx.fillStyle = "white";
		ctx.fillRect(0, 0, options.width, options.height);
		
		//Load the screenshot image
		let img = await loadImage(screenshot_buffer);
		
		//Draw the screenshot into the area left over once the margins are subtracted
		ctx.drawImage(img,
			margin_left,
			margin_top,
			options.width - margin_left - margin_right,
			options.height - margin_top - margin_bottom
		);
		
		//Return statement
		return canvas.toBuffer();
	};
	
	/**
	 * Downloads a page and concatenates the text content of every element matched by the
	 * given CSS selector(s), in selector order and then document order.
	 * @alias CURL.getPlaintextFromSelectors
	 *
	 * @param {string} arg0_url
	 * @param {string|string[]} arg1_selectors
	 *
	 * @returns {Promise<string>} - The joined text, or an empty string if no HTML could be fetched.
	 */
	CURL.getPlaintextFromSelectors = async function (arg0_url, arg1_selectors) {
		//Convert from parameters
		let url = arg0_url;
		let selectors = Array.toArray(arg1_selectors);
		
		//Fetch the page; without any HTML there is nothing to query
		let page_html = await CURL.getWebsiteHTML(url);
		if (!page_html) return "";
		
		//Parse the page and gather text selector by selector
		let body_el = new JSDOM.JSDOM(page_html).window.document.body;
		let collected_text = ``;
		
		for (let local_selector of selectors)
			for (let local_match of body_el.querySelectorAll(local_selector))
				collected_text += local_match.textContent;
		
		//Return statement
		return collected_text;
	};
	
	/**
	 * Fetches the HTML content of a website. A plain synchronous GET is tried first; if it
	 * throws or yields an empty body, the page is loaded through a Puppeteer browser instead.
	 * @alias CURL.getWebsiteHTML
	 *
	 * @param {string} arg0_url
	 *
	 * @returns {Promise<string>} - The page HTML. Errors raised by the Puppeteer fallback propagate to the caller.
	 */
	CURL.getWebsiteHTML = async function (arg0_url) {
		//Convert from parameters
		let url = arg0_url;
		
		//Declare local instance variables
		let fetch_html;
		
		//Function body
		try {
			fetch_html = sync_request("GET", url).getBody("utf8");
		} catch (e) {
			//Any failure of the plain request just means the browser fallback below is used
			fetch_html = "";
		}
		
		//Use Chrome profile instead if website HTML could not be fetched normally
		if (!fetch_html) {
			let chrome_instance = await puppeteer.launch();
			
			try {
				let page = await chrome_instance.newPage();
				
				await page.goto(url, { waitUntil: "networkidle2" });
				fetch_html = await page.content();
			} finally {
				//Always shut the browser down, even when navigation fails, so no Chrome process is leaked
				await chrome_instance.close();
			}
		}
		
		//Return statement
		return fetch_html;
	};
	
	/**
	 * Fetches all anchor links from a URL, resolved to absolute URLs, with optional filtering.
	 * When the page yields no HTML or no links at all, the fetch is retried after a random
	 * delay until options.max_attempts is reached.
	 * @alias CURL.getWebsiteLinks
	 *
	 * @param {string} arg0_url
	 * @param {Object} [arg1_options] - Not modified; a shallow copy is used internally.
	 *  @param {string|string[]} [arg1_options.allowed_domains] - Keep only links containing at least one of these substrings.
	 *  @param {string|string[]} [arg1_options.exclude_domains] - Drop links containing any of these substrings.
	 *  @param {number} [arg1_options.attempts=1] - Number of the current attempt; incremented by the retry recursion.
	 *  @param {number} [arg1_options.max_attempts=15]
	 *
	 * @returns {Promise<string[]>} - The links, passed through Array.unique().
	 */
	CURL.getWebsiteLinks = async function (arg0_url, arg1_options) {
		//Convert from parameters
		let url = arg0_url;
		//Shallow copy, so defaults and the retry counter never leak into the caller's object
		let options = Object.assign({}, arg1_options);
		
		//Initialise options
		if (options.attempts === undefined) options.attempts = 1;
		if (options.max_attempts === undefined) options.max_attempts = 15;
		
		//Declare local instance variables
		let attempt_to_reconnect = false;
		let html = await CURL.getWebsiteHTML(url);
		let links = [];
		
		if (html) {
			let dom = new JSDOM.JSDOM(html);
			let website_body = dom.window.document.body;
			//Only anchors that carry an href; a missing href would be stringified by new URL() into "<base>/null"
			let all_link_els = website_body.querySelectorAll(`a[href]`);
			
			for (let i = 0; i < all_link_els.length; i++)
				try {
					links.push(new URL(all_link_els[i].getAttribute("href"), url).href);
				} catch (e) {
					//Deliberately best-effort: an href that cannot be resolved to a URL is skipped
				}
			
			//If no links were found at all (before filtering), try again
			if (links.length === 0) attempt_to_reconnect = true;
			
			//Keep only links matching options.allowed_domains, if defined
			if (options.allowed_domains) {
				let allowed_domains = [].concat(options.allowed_domains);
				
				links = links.filter((local_link) =>
					allowed_domains.some((local_domain) => local_link.includes(local_domain)));
			}
			
			//Drop links matching options.exclude_domains, whether or not allowed_domains was given
			if (options.exclude_domains) {
				let exclude_domains = [].concat(options.exclude_domains);
				
				links = links.filter((local_link) =>
					!exclude_domains.some((local_domain) => local_link.includes(local_domain)));
			}
		} else {
			attempt_to_reconnect = true;
		}
		
		if (attempt_to_reconnect && options.attempts < options.max_attempts) {
			let random_delay = Math.randomNumber(500, 10000);
			
			console.log(`Attempt ${options.attempts} failed to fetch links. Retrying after ${random_delay}ms ..`);
			
			await CURL.sleep(random_delay);
			return await CURL.getWebsiteLinks(url, Object.assign({}, options, {
				attempts: options.attempts + 1
			}));
		}
		
		//Return statement
		return Array.unique(links);
	};
	
	/**
	 * Fetches a website and returns its content stripped down to plaintext via CURL.stripHTML().
	 * @alias CURL.getWebsitePlaintext
	 *
	 * @param {string} arg0_url
	 *
	 * @returns {Promise<string>} - The plaintext, or an empty string if no HTML could be fetched.
	 */
	CURL.getWebsitePlaintext = async function (arg0_url) {
		//Convert from parameters
		let url = arg0_url;
		
		//Declare local instance variables
		let fetch_html = await CURL.getWebsiteHTML(url);
		
		//Return statement; always a string, matching CURL.getPlaintextFromSelectors() on a failed fetch
		return (fetch_html) ? CURL.stripHTML(fetch_html) : "";
	};
	
	/**
	 * Builds one plaintext dump by running every scraping rule in options.scrape_urls through
	 * CURL.generatePlaintextRecursively(), one after another, and joining the results.
	 * @alias CURL.generatePlaintext
	 *
	 * @param {Object} arg0_options
	 *  @param {boolean} [arg0_options.cache=false] - Whether to write the dump to a dated .txt file.
	 *  @param {string} [arg0_options.cache_folder='./cache/']
	 *  @param {string} [arg0_options.cache_prefix='']
	 *  @param {Object[]} arg0_options.scrape_urls - Scraping rules; each one is deep-copied before use.
	 *
	 * @returns {Promise<string>}
	 */
	CURL.generatePlaintext = async function (arg0_options) {
		//Convert from parameters
		let options = (arg0_options) ? arg0_options : {};
		
		//Initialise options
		if (!options.cache_folder) options.cache_folder = `./cache/`;
		if (!options.cache_prefix) options.cache_prefix = "";
		
		//Declare local instance variables
		let plaintext_dump = ``;
		let scrape_rules = [];
		
		if (options.scrape_urls) scrape_rules = Array.toArray(options.scrape_urls);
		
		//Scrape each rule in turn; the JSON round-trip hands the recursion its own copy of the rule
		for (let local_rule of scrape_rules) {
			let local_rule_copy = JSON.parse(JSON.stringify(local_rule));
			
			plaintext_dump += await CURL.generatePlaintextRecursively(local_rule_copy);
		}
		
		//Write the dump to the cache folder if options.cache is true
		if (options.cache)
			CURL.writeTextFile(
				`${options.cache_folder}${options.cache_prefix}${ABRS.returnDateString()}.txt`,
				plaintext_dump
			);
		
		//Return statement
		return plaintext_dump;
	};
	
	/**
	 * Helper function for generatePlaintext(). Scrapes a single page according to the selector
	 * rules for the current depth, then follows the page's links while
	 * options.depth < options.recursive_depth.
	 * @alias CURL.generatePlaintextRecursively
	 *
	 * @param {Object} arg0_options - Must be JSON-serialisable; it is cloned for every followed link.
	 *  @param {string} arg0_options.url - The page to scrape.
	 *  @param {Object[]} [arg0_options.selectors] - Rules indexed by depth; a missing entry means "whole body".
	 *   Each entry may hold `scrape_iframes` (boolean), `exclude` (string[]) and `include` (string[]).
	 *  @param {number} [arg0_options.depth=0] - Current recursion depth.
	 *  @param {number} [arg0_options.recursive_depth] - Links are followed only while depth is below this.
	 *  @param {string[]} [arg0_options.recursive_links] - Passed to getWebsiteLinks() as allowed_domains.
	 *  @param {string[]} [arg0_options.recursive_exclude_links] - Passed to getWebsiteLinks() as exclude_domains.
	 *  @param {string[]} [arg0_options.crawled_pages=[]] - URLs already scraped; shared by the whole recursion.
	 *
	 * @returns {Promise<string>}
	 */
	CURL.generatePlaintextRecursively = async function (arg0_options) {
		//Convert from parameters
		let options = (arg0_options) ? arg0_options : {};
		
		//Initialise options
		if (options.depth === undefined) options.depth = 0;
		if (options.crawled_pages === undefined) options.crawled_pages = [];
		
		//A rule without selectors (or without an entry for this depth) scrapes the whole body
		if (!options.selectors) options.selectors = [];
		if (!options.selectors[options.depth]) options.selectors[options.depth] = {};
		
		//Mark this page as crawled up front so that a link back to itself is not followed again
		if (!options.crawled_pages.includes(options.url)) options.crawled_pages.push(options.url);
		
		//Declare local instance variables
		let selectors = options.selectors[options.depth];
		let string = ``;
		let website_html = await CURL.getWebsiteHTML(options.url);
		
		let dom = new JSDOM.JSDOM(website_html);
		let website_body = dom.window.document.body;
		
		console.log(`Scraping`, options.url);
		
		//1. Scrape iframes handler (replace each iframe with the HTML of the document it embeds)
		if (selectors.scrape_iframes) {
			let all_iframe_els = website_body.querySelectorAll(`iframe`);
			
			for (let i = 0; i < all_iframe_els.length; i++) {
				//iframes reference their document through src; one without src has nothing to fetch
				let local_src = all_iframe_els[i].getAttribute("src");
				if (!local_src) continue;
				
				try {
					let local_href = new URL(local_src, options.url).href;
					
					all_iframe_els[i].outerHTML = await CURL.getWebsiteHTML(local_href);
				} catch (e) {
					//Deliberately best-effort: an iframe that cannot be resolved or fetched is left as it is
				}
			}
		}
		
		//2. Exclude elements from website DOM first
		if (selectors.exclude)
			for (let i = 0; i < selectors.exclude.length; i++) {
				let local_els = website_body.querySelectorAll(selectors.exclude[i]);
				for (let x = 0; x < local_els.length; x++)
					local_els[x].remove();
			}
		
		//3. Fetch include_els for later processing; identical outerHTML is only kept once
		let include_els_html = [];
		
		if (selectors.include) {
			for (let i = 0; i < selectors.include.length; i++) {
				let local_els = website_body.querySelectorAll(selectors.include[i]);
				
				for (let x = 0; x < local_els.length; x++)
					if (!include_els_html.includes(local_els[x].outerHTML))
						include_els_html.push(local_els[x].outerHTML);
			}
		} else {
			include_els_html.push(website_body.outerHTML);
		}
		
		//4. Strip HTML
		for (let i = 0; i < include_els_html.length; i++)
			string += CURL.stripHTML(include_els_html[i]) + "\n";
		
		//5. Recursive depth handler
		if (options.depth < options.recursive_depth) {
			let current_website_links = await CURL.getWebsiteLinks(options.url, {
				allowed_domains: options.recursive_links,
				exclude_domains: options.recursive_exclude_links
			});
			
			for (let i = 0; i < current_website_links.length; i++) {
				let local_link = current_website_links[i];
				
				//Skip anything already scraped, including pages reached through earlier siblings
				if (options.crawled_pages.includes(local_link)) continue;
				
				let new_options = JSON.parse(JSON.stringify(options));
				//Share one crawled list across the whole recursion instead of the per-child JSON copy
				new_options.crawled_pages = options.crawled_pages;
				new_options.url = local_link;
				new_options.depth++;
				
				let local_page_plaintext = await CURL.generatePlaintextRecursively(new_options);
				if (local_page_plaintext) string += local_page_plaintext;
			}
		}
		
		//Return statement
		return string;
	};
	
	/**
	 * Reduces an HTML string to plaintext: script and style elements are dropped, the body's
	 * text content is split into lines, each line is trimmed, and blank lines are discarded.
	 * @alias CURL.stripHTML
	 *
	 * @param {string} arg0_html
	 *
	 * @returns {string}
	 */
	CURL.stripHTML = function (arg0_html) {
		//Convert from parameters
		let html = arg0_html;
		
		//Parse the markup and take its body
		let body_el = new JSDOM.JSDOM(html).window.document.body;
		
		//Drop elements whose text content is code rather than readable text
		for (let local_el of body_el.querySelectorAll(`script, style`))
			local_el.remove();
		
		//Trim every line and keep only the non-empty ones
		let raw_text = body_el.textContent || "";
		let kept_lines = [];
		
		for (let local_line of raw_text.split("\n")) {
			let trimmed_line = local_line.trim();
			
			if (trimmed_line.length > 0) kept_lines.push(trimmed_line);
		}
		
		//Return statement
		return kept_lines.join("\n").trim();
	};
	
	/**
	 * Synchronously writes text to the given path. A failed write is reported through
	 * console.error() and not rethrown.
	 * @alias CURL.writeTextFile
	 *
	 * @param {string} arg0_filepath
	 * @param {string} arg1_text
	 */
	CURL.writeTextFile = function (arg0_filepath, arg1_text) {
		//Write to file, logging rather than propagating any failure
		try {
			fs.writeFileSync(arg0_filepath, arg1_text);
		} catch (write_error) {
			console.error(write_error);
		}
	};
}