Entries
Utility(Event,Mouse): Log Movement X,Y
Utility(txt): Basic Sentence Generator
Utility(Event,Keyboard): Logger
Utility(txt): Expanded Phrase Generator
Scrape(General|Multi): Paginated Crawler
Scrape(Sunbiz|Multi): Get Biz's By Zip
Utility(Time): Test Load Speed
Scrape(General|Single|NLP): Get Keyword Stats
Scrape(General|Single): Get Text Near Links
Utility(Event|Mouse): Press Down,Up, & Dragend
Utility(Number): Between Range
Utility(Event|Window): On Resize
Scrape(Youtube|Single): Crawl Channel Videos
Scrape(General|Multi): Tiny Link Crawler
Utility(Time): Day of Week Report
Utility(Number): Aspect Ratio Size Pairs
Utility(Data): Sort Object Properties
Scrape(General|Single): RiTa Sentences & Words
Scrape(General|Single): Investigate Element Layers
Scrape(NLP|Single): Using Compromise Plugins
Utility(General): Remote Script Loader
Scrape(General|Single|NLP): Compromise RiTa D3
Scrape(General|Multi|Node): Grouped Node Crawler
Scrape(Amazon|Multi): Crawl Product SERPs
Scrape(Amazon|Multi): Get Paginated Brands
Utility (Data): Download JSON in Browser
Utility (Data): JSON AutoTypes
Scrape(YouTube|Single): Video Page
Utility (Text): Make Page Editable
Utility (Text): Article Editor
Scrape(General|Single|Text): Get Text On Click
Utility (File): DnD File Parser (CSV,JSON,TXT)
Scrape(General|Single): Get Links (Examples)
Scrape(General|Single|Text): Get Sentences by Tag
Utility (File): JSON to CSV via D3.js
Scrape(General|Single): Auto Parse by Content Type
Scrape(General|Single): Get Paragraphs & Sentences
Scrape(Amazon|Multi): Get Reviews by ASIN
Scrape(General|Single): Download Images on Page
Utility(Event,Form): Custom Submit Function
Utility (Fetch): Basic API Post Request
Utility (Event,Form,Fetch): Form Data to API
Utility (Time): Async Delay
Utility (Time): Async Repeat Every N Secs
Scrape(coj): Crawl Property SERPs
Utility (Data): Promise Chain
Utility (Fetch): Examples - JSON,Text,HTML
Scrape(Amazon|Single): Product Review NLP
Utility (Nodejs): Streaming Collections
Scrape(Rate My Professor|Multi): Crawl Prof SERPs
Utility (Time): JS Timer
Utility (Text): Proper Case
Scrape(Thingiverse API): Get Things via Search API
Scrape(General|Single|Node): Get Node Attributes
Scrape(General|Single|Node): Node Attributes + Text
Scrape(Thesaurus): Get Words from SERPs
Scrape(Walmart): Crawl Product SERPs
Scrape(free3d): Crawl 3D Model SERPs
Scrape(Aliexpress|Single): Get Products From SERP
Scrape(simplify3d): Crawl Post SERPs
Scrape(Twitter): Crawl Post Feed (infinite scroll)
Scrape(DDuckGo|Single): Get Links from SERP
Scrape(General|Single): Get Tokens String Distance
Scrape(General|Single): Content Report
Scrape(General|Single|Node): Node Recon (CSV)
Utility (File): D3 JSON to CSV
Scrape(coj|Multi): Crawl Property SERPs
Scrape(coj|Single): sidenote
Scrape(General|Single): Recursive Node Crawler
Utility (Event,Window): Scroll to Root ScrollHeight
Scrape(Indeed|Multi): Crawl Job SERPs
Scrape(Thingiverse API): Get Things By Id
Scrape(Thingiverse): Crawl Things by Category
Scrape(Thingiverse API): Get Thing Batches by Id via DnD
Scrape(YouTube|Single): Get Video Playlist
Utility (Data): Join Thing Metrics & Meta
Utility(Data): Get Nested Array Lengths
Utility (Twitter): Hide Followed Profiles
Utility (Time): YYYY-MM-DD HH:MM:SS
Scrape(Thangs|Multi): Crawl 3D Model SERPs
Scrape(PrusaPrints,Multi): Get Prints
Scrape(Reddit,Single): Get Posts
Userscript(Youtube): Scrape Channel Videos
Userscript(Youtube): Tab Manager
Scrape(Sunbiz|Multi): Biz Details
Utility(Data): DnD View Types
Scrape(General|Single|Node): Select Nodes by Attr
Scrape(Aliexpress|Multi): Get Products via API
Utility(Text): Strip Web Page CSS, Script, Events, Media
Scrape(Youtube|Single): Get Subs
Scrape(General|Single): SelectAll ReduceByProp
Scrape(General|Single): SelectAll ReduceMultiProps
Scrape(General|Multi): Tiny Link Crawler + Delay & Node Reports
Scrape(P5|Multi): Get Examples
Scrape(LinkedIn|Single): Find New Connections
UserScript(linkedIn|Single): Get Jobs
Utility (Time): Date From Days Ago
Utility(General|Single): Keep Scrolling
Scrape(YouTube): Videos From Search
Utility(General|Single): getOffset
Utility(Event,Form): Get Data On Form Input
Utility(Event,Element): ResizeObserver
COCO-SSD Object Categories
Scrape(Wikipedia|Multi): What Links Here?
Scrape(DDuckGo|Single): Download Images
Scrape(General|Single|NLP): Compromise nGram
Scrape(General|Multi|Node): Grouped Node Crawler
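This snippet crawls outward from whatever page it is run on: each URL in a link Set is fetched, parsed into a virtual DOM with DOMParser, and reduced to a report that groups node text and attributes by tag type (meta, headings, paragraphs, lists, media, and so on). Internal links found on each page are appended to the Set mid-crawl until the 10-page limit is reached, and the per-page reports accumulate in one collection logged to the console.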
(async () => {
  //SETTINGS
  var collection = [];
  var CRAWLERSETTINGS = { LIMIT: 10, INDEX: 0 };
  var fetchSettings = {
    url: document.URL,
    options: {
      method: 'GET', // *GET, POST, PUT, DELETE, etc.
      //cache: 'no-cache', // *default, no-cache, reload, force-cache, only-if-cached
      credentials: 'same-origin', // include, *same-origin, omit
      redirect: 'follow', // manual, *follow, error
    }
  };
  var links = new Set([fetchSettings.url]);
  //END SETTINGS

  //NODE EXTRACTION FUNCTIONS
  var extract = (el, t = "text", arr) => {
    var getNodeData = (el) => {
      var text = getNodeText(el);
      var attributes = getNodeAttributes(el);
      return { text, ...attributes, tag: el.nodeName };
    };
    if (t == "text") {
      var nodeObj = getNodeData(el);
      Object.keys(nodeObj).length > 0 ? arr.push(nodeObj) : null;
    }
    return arr;
  };

  //select the node text, starting with innerText, then trying other possible
  //alternatives (depends on the selected html element)
  const getNodeText = (el, clean = true) => {
    var text = el.innerText ? el.innerText : el.value ? el.value : el.content ? el.content : "";
    return clean
      ? text
          .replace(/\.+/gim, ".").replace(/\?+/gim, "?").replace(/!+/gim, "!") //clean up repeated punctuation marks
          .replace(/[ \t]+/gim, " ") //remove extra white space, replacing tabs with spaces
          .replace(/[\n\r]+/gim, "\n").replace(/( ?\n ?)+/gim, "\n") //remove extra line breaks, replacing returns with new lines
          .trim()
      : text; //if clean is false, just return the raw text
  };

  const getNodeAttributes = (el) => {
    try {
      return el.attributes
        ? [...el.attributes].reduce((atts, att) => {
            atts[att.nodeName] = att.value;
            return atts;
          }, {})
        : {};
    } catch (e) {
      //if there is an error with selection, log it and return an empty object
      console.log({ e });
      return {};
    }
  };
  //END EXTRACTION FUNCTIONS

  //GET DOCUMENT
  var getDoc = async (settings) => {
    const isValid = (response) => response.ok; //simple check to see if the response is good
    var response = await fetch(settings.url, settings.options); //fetch the doc, await the response object
    return isValid(response) ? await response.text() : "Not Valid"; //return the document text if the response is valid
  };

  //PARSE DOCUMENT
  var parseDoc = (docText) => { //input = html string, output = virtual dom
    var parser = new DOMParser(); //new parser instance
    return parser.parseFromString(docText, "text/html"); //return the parsed doc
  };

  var generateReport = ({ link, doc }) => {
    var domain = doc.domain;
    var cleanText = (str) => str.replace(/[\s\t\n\r]+/gim, " ").trim(); //utility function to clean whitespace
    var report = {};
    try {
      var getElData = (selector, type = "text") =>
        [...doc.querySelectorAll(selector)].reduce((arr, el) => extract(el, type, arr), []);
      report = {
        title: doc.querySelector("title").innerText,
        url: link,
        meta: getElData("meta"),
        resources: [...doc.scripts, ...doc.styleSheets].reduce((arr, el) => extract(el, "text", arr), []),
        links: [...doc.links].reduce((acc, a) => {
          var linkObject = { text: cleanText(a.innerText), href: a.href };
          linkObject.href.indexOf(domain) > -1
            ? acc.internal.push(linkObject)
            : acc.external.push(linkObject);
          return acc;
        }, { internal: [], external: [] }),
        headingText: getElData("h1,h2,h3,h4,h5,h6"),
        paragraphs: getElData("p"),
        semanticTags: getElData("article,details,section,summary,figcaption,figure,data,dialog,main"),
        lists: getElData("ul,ol,li,dir,dl,dt,dd"),
        io: getElData("form,input,textarea,button,select,optgroup,option,label,fieldset,legend,datalist,output"),
        formatted: getElData("acronym,abbr,b,bdi,bdo,big,center,code,del,dfn,em,font,i,ins,kbd,mark,meter,pre,progress,rp,rt,ruby,s,samp,small,strike,strong,sub,sup,template,tt,u,var,wbr"),
        references: getElData("address,blockquote,cite,q,time"),
        images: getElData("img,map,area,canvas,figcaption,figure,picture,svg"),
        video: getElData("video"),
        audio: getElData("audio"),
        headerFooterAsideNav: getElData("header,footer,nav,aside"),
        navigation: getElData("a,nav"),
        divs: getElData("div")
      };
    } catch (e) {
      console.log(e);
    }
    return report;
  };

  //Loop through the link Set. By default, the crawler starts on the current page;
  //internal links discovered along the way are added to the Set mid-iteration,
  //which is what keeps the crawl going.
  for await (let link of links) {
    CRAWLERSETTINGS.INDEX++; //track the link index
    fetchSettings.url = link; //replace the old url with the new one
    try {
      //STEP 1 - get document via fetch
      var docText = await getDoc(fetchSettings);
      //STEP 2 - parse document via the DOMParser API
      var doc = parseDoc(docText);
      //STEP 3 - extract data from doc
      var report = generateReport({ link, doc });
      //STEP 4 - add doc data to bulk collection
      collection.push(report);
      if (CRAWLERSETTINGS.INDEX < CRAWLERSETTINGS.LIMIT) {
        //STEP 5 - keep finding & adding the current doc's internal links to the
        //crawl list until the limit is reached or the search is exhausted
        report.links.internal.forEach(internalLink => {
          links.size < CRAWLERSETTINGS.LIMIT ? links.add(internalLink.href) : null;
        });
        //log progress to the console
        console.log({
          ...CRAWLERSETTINGS, //limit & index
          reports: collection.length, //# of page-level reports
          links: links.size //# of unique links found
        });
      } else {
        //check whether crawler limits have been reached
        var limitReached = CRAWLERSETTINGS.INDEX >= CRAWLERSETTINGS.LIMIT;
        var linkSizeReached = links.size <= CRAWLERSETTINGS.INDEX;
        if (limitReached || linkSizeReached) {
          console.log({ limitReached, linkSizeReached, collection });
        }
      }
    } catch (e) {
      console.log({ failedToGetPage: e });
    }
  } //END FOR AWAIT LOOP - link of links
})();
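The finished crawl only lands in the console. A minimal sketch for saving it, assuming the snippet above has just run in the same console session so collection is still in scope; the downloadJSON helper is illustrative, not part of the snippet (the "Utility (Data): Download JSON in Browser" entry above covers the same idea):

//A minimal sketch: download `collection` as a JSON file.
//`downloadJSON` is a hypothetical helper, not part of the crawler above.
var downloadJSON = (data, filename = "crawl-report.json") => {
  var blob = new Blob([JSON.stringify(data, null, 2)], { type: "application/json" }); //serialize the data to a JSON Blob
  var url = URL.createObjectURL(blob); //temporary object URL pointing at the blob
  var a = document.createElement("a");
  a.href = url;
  a.download = filename; //suggested filename for the saved file
  a.click(); //programmatic click triggers the browser download
  URL.revokeObjectURL(url); //release the object URL when done
};
downloadJSON(collection);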