Last active
December 21, 2017 15:10
-
-
Save binux/fb853cd4f4d699d178e66c699b58be68 to your computer and use it in GitHub Desktop.
generate css selector for data extraction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
javascript:(function () { /* Bookmarklet: inject the hosted css_selector_helper.js into the current page; falls back to the root element when the document has no <head>. */ const script = document.createElement('script'); (document.getElementsByTagName('head')[0] || document.documentElement).appendChild(script); script.src = 'https://gistcdn.githack.com/binux/fb853cd4f4d699d178e66c699b58be68/raw/css_selector_helper.js'; }());
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:
// Author: Binux<[email protected]>
// http://binux.me
// Created on 2013-11-11 18:50:58
if (window._css_selector_helper) { | |
window._css_selector_helper.detach(); | |
} else { | |
/**
 * Shallow, order-sensitive equality check for two array-like values.
 *
 * Only `length` and integer indices are read, so it works on arrays and on
 * other indexed collections alike.
 *
 * @param {ArrayLike<*>} a - First collection.
 * @param {ArrayLike<*>} b - Second collection.
 * @returns {boolean} `true` only when both arguments are truthy, have the same
 *   length, and hold strictly equal (`===`) values at every index.
 */
function arrayEquals(a, b) {
  if (!a || !b) {
    return false;
  }
  if (a.length !== b.length) {
    return false;
  }
  for (let i = 0; i < a.length; i++) {
    if (a[i] !== b[i]) {
      return false;
    }
  }
  return true;
}
/**
 * Sum `offsetTop` / `offsetLeft` along the `offsetParent` chain of an element.
 *
 * @param {{offsetTop?: number, offsetLeft?: number, offsetParent?: Object}} elem
 *   Element to start from; a null/undefined value yields a zero offset.
 * @returns {{top: number, left: number}} Accumulated offsets of `elem` and
 *   every ancestor reachable through `offsetParent`.
 */
function getOffset(elem) {
  let top = 0;
  let left = 0;
  // Walk with a local cursor so the caller's argument is never reassigned.
  for (let node = elem; node; node = node.offsetParent) {
    // Nodes without numeric offsets contribute nothing instead of poisoning
    // the sum with NaN or string concatenation.
    if (Number.isFinite(node.offsetLeft)) {
      left += node.offsetLeft;
    }
    if (Number.isFinite(node.offsetTop)) {
      top += node.offsetTop;
    }
  }
  return { top: top, left: left };
}
/**
 * Build the display name of one path element from its selected features.
 *
 * @param {Array<{name: string, selected: boolean}>} features - Feature
 *   descriptors in the order they should appear in the name.
 * @returns {string} Concatenation of `name` for every feature whose `selected`
 *   flag is truthy; the empty string when none is selected.
 */
function merge_name(features) {
  let element_name = '';
  for (const feature of features) {
    if (feature.selected) {
      element_name += feature.name;
    }
  }
  return element_name;
}
/**
 * Serialize a path (root first) into a CSS selector string.
 *
 * Consecutive selected entries are joined with the child combinator (`>`).
 * A gap (an unselected or `invalid` entry) degrades the link to a descendant
 * combinator, which is the plain space produced by the final join.
 *
 * @param {Array<{selected: boolean, invalid: boolean,
 *   features: Array<{pattern: string, selected: boolean}>}>} path
 *   Path entries ordered from the outermost ancestor to the target element.
 * @param {number} [end] - Index of the last entry to include. Omitted or
 *   negative means the whole path is used.
 * @returns {string} The selector, or `'*'` when no entry contributes.
 */
function merge_pattern(path, end) {
  const patterns = [];
  // True while the previously visited entry was emitted, i.e. the next emitted
  // entry is its direct child.
  let previous_emitted = false;
  for (let i = 0; i < path.length; i++) {
    if (end >= 0 && i > end) {
      break;
    }
    const p = path[i];
    if (p.invalid || !p.selected) {
      previous_emitted = false;
      continue;
    }
    if (previous_emitted) {
      patterns.push('>');
    }
    let element_pattern = '';
    for (const f of p.features) {
      if (f.selected) {
        element_pattern += f.pattern;
      }
    }
    // A selected entry with no selected feature still occupies a level.
    patterns.push(element_pattern === '' ? '*' : element_pattern);
    previous_emitted = true;
  }
  if (patterns.length === 0) {
    return '*';
  }
  return patterns.join(' ');
}
/**
 * Describe the ancestor chain of `element` and minimise the selector for it.
 *
 * For every node from `element` up to the root (following `parentElement`) a
 * path entry is built holding candidate selector features: tag name, id,
 * classes and the `rel` / `property` / `itemprop` attributes. Features, and then
 * whole entries, are switched off one at a time; one stays off when
 * `doc.querySelectorAll` still returns the same elements without it.
 *
 * @param {Document} doc - Document the candidate selectors are evaluated in.
 * @param {Element} element - Target element.
 * @returns {Array<{tag: string, name: string, xpath: string, selected: boolean,
 *   invalid: boolean, features: Array<{name: string, pattern: string,
 *   selected: boolean}>}>} Path entries ordered root first, target last.
 */
function path_info(doc, element) {
  // Must be a real array: the former `('rel', 'property', 'itemprop')` was a
  // comma expression yielding the string 'itemprop', which turned the lookup
  // into a substring test.
  const allowed_attr_names = ['rel', 'property', 'itemprop'];
  const path = [];

  for (let node = element; node; node = node.parentElement) {
    const tag = node.tagName.toLowerCase();
    const features = [];

    // tagName
    features.push({ name: tag, pattern: tag, selected: true });

    // id
    const id = node.getAttribute('id');
    if (id) {
      const id_pattern = '#' + CSS.escape(id);
      features.push({ name: id_pattern, pattern: id_pattern, selected: true });
    }

    // class: names that need escaping start out unselected
    for (const class_name of Array.from(node.classList)) {
      const escaped = CSS.escape(class_name);
      features.push({
        name: '.' + escaped,
        pattern: '.' + escaped,
        selected: escaped === class_name,
      });
    }

    // rel, property, itemprop
    for (const attr of Array.from(node.attributes)) {
      if (!allowed_attr_names.includes(attr.nodeName)) {
        continue;
      }
      const value = attr.nodeValue;
      // An unquoted empty value (`[rel=]`) is not a valid selector, so the
      // empty case is written with explicit quotes.
      const attr_pattern = value === ''
        ? '[' + attr.nodeName + '=""]'
        : '[' + attr.nodeName + '=' + CSS.escape(value) + ']';
      features.push({ name: attr_pattern, pattern: attr_pattern, selected: true });
    }

    // xpath step: the tag, plus a 1-based index among same-tag siblings when
    // the parent has more than one child node
    let xpath = tag;
    const siblings = node.parentNode ? node.parentNode.childNodes : [];
    if (siblings.length > 1) {
      let same_tag_before = 0;
      for (const sibling of Array.from(siblings)) {
        if (sibling === node) {
          xpath += '[' + (same_tag_before + 1) + ']';
          break;
        }
        if (sibling.tagName === node.tagName) {
          same_tag_before++;
        }
      }
    }

    // pack it up
    path.push({
      tag: tag,
      name: merge_name(features),
      xpath: xpath,
      selected: true,
      // tbody entries are flagged so merge_pattern never emits them
      invalid: tag === 'tbody',
      features: features,
    });
  }
  path.reverse();

  // Reference result of the full, unreduced selector.
  const selected_elements = doc.querySelectorAll(merge_pattern(path));

  // Pass 1: per entry, drop every feature whose removal leaves the result of
  // the selector truncated at that entry unchanged.
  path.forEach(function(p, i) {
    if (p.invalid) {
      return;
    }
    const feature_selected_elements = doc.querySelectorAll(merge_pattern(path, i));
    p.features.forEach(function(f) {
      f.selected = false;
      const still_same = arrayEquals(
        feature_selected_elements,
        doc.querySelectorAll(merge_pattern(path, i))
      );
      if (!still_same) {
        f.selected = true;
      }
    });
    // Keep at least the tag so the entry never collapses to nothing.
    if (p.features.every(function(f) { return !f.selected; })) {
      p.features[0].selected = true;
    }
    p.name = merge_name(p.features);
  });

  // Pass 2: drop whole entries whose removal leaves the full result unchanged.
  path.forEach(function(p) {
    p.selected = false;
    if (arrayEquals(selected_elements, doc.querySelectorAll(merge_pattern(path)))) {
      p.name = p.tag;
      return;
    }
    p.selected = true;
  });

  return path;
}
/**
 * Interactive selector picker bound to one window.
 *
 * While attached, hovering an element draws a translucent overlay over it and
 * clicking an element computes a selector for it, outlines every element the
 * selector matches and shows the selector in a `prompt` dialog.
 */
class CSSSelectorHelperServer {
  /**
   * Attach the hover and click listeners to `window.document`.
   *
   * @param {Window} window - Window whose document is instrumented.
   */
  constructor(window) {
    this.window = window;
    this.document = window.document;

    this.mouseover_listener = (ev) => {
      this.overlay(ev.target);
    };
    this.document.addEventListener("mouseover", this.mouseover_listener);

    this.click_listener = (ev) => {
      // Suppress the page's own click handling while picking.
      ev.preventDefault();
      ev.stopPropagation();
      const selector = merge_pattern(path_info(this.document, ev.target));
      // Query the instrumented document, not whatever global `document` is.
      this.heightlight(this.document.querySelectorAll(selector));
      // Deferred so the prompt call happens after this handler has returned.
      this.window.setTimeout(() => {
        this.window.prompt('selector_helper_click', selector);
      });
      return false;
    };
    // Registered for the capture phase (third argument `true`).
    this.document.addEventListener("click", this.click_listener, true);
  }

  /**
   * Remove every box drawn by this helper and unregister both listeners.
   */
  detach() {
    this._remove_boxes('pyspider_highlight');
    this._remove_boxes('pyspider_overlay');
    this.document.removeEventListener("click", this.click_listener, true);
    this.document.removeEventListener("mouseover", this.mouseover_listener);
  }

  /**
   * Replace the current hover overlay with one box per given element.
   *
   * @param {string|Element|Iterable<Element>} elements - A selector string, a
   *   single element, or a collection of elements.
   */
  overlay(elements) {
    this._draw_boxes(
      elements,
      'pyspider_overlay',
      'z-index: 999999;background-color: rgba(255, 165, 0, 0.3);position: absolute;pointer-events: none;',
      0
    );
  }

  /**
   * Replace the current outline boxes with one per given element.
   * The misspelled name is kept because it is the established method name.
   *
   * @param {string|Element|Iterable<Element>} elements - A selector string, a
   *   single element, or a collection of elements.
   */
  heightlight(elements) {
    this._draw_boxes(
      elements,
      'pyspider_highlight',
      'z-index: 888888;border: 2px solid #c00;position: absolute;pointer-events: none;',
      // Shift by the 2px border so the outline surrounds the element.
      2
    );
  }

  /**
   * Correctly spelled alias of {@link CSSSelectorHelperServer#heightlight}.
   *
   * @param {string|Element|Iterable<Element>} elements
   */
  highlight(elements) {
    this.heightlight(elements);
  }

  /**
   * Evaluate an XPath expression against the document.
   *
   * @param {string} path - XPath expression.
   * @returns {Node|null} The `singleNodeValue` of a FIRST_ORDERED_NODE_TYPE
   *   evaluation.
   */
  getElementByXpath(path) {
    return this.document.evaluate(path, this.document, null, this.window.XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
  }

  /** Remove every element carrying the given marker class. */
  _remove_boxes(class_name) {
    [...this.document.querySelectorAll('.' + class_name)].forEach((elem) => {
      elem.remove();
    });
  }

  /**
   * Shared body of overlay() and heightlight(): clear the old boxes of one
   * kind, then append an absolutely positioned box per element to the body.
   *
   * @param {string|Element|Iterable<Element>} elements - Targets to cover.
   * @param {string} class_name - Marker class of the generated boxes.
   * @param {string} base_style - Inline style prefix for every box.
   * @param {number} inset - Pixels subtracted from the top/left position.
   */
  _draw_boxes(elements, class_name, base_style, inset) {
    // Resolve the targets before clearing, mirroring the original ordering.
    if (typeof elements === 'string') {
      elements = this.document.querySelectorAll(elements);
    }
    if (elements instanceof this.window.Element) {
      elements = [elements];
    }
    this._remove_boxes(class_name);
    [...elements].forEach((elem) => {
      const offset = getOffset(elem);
      const div = this.document.createElement("div");
      div.className = class_name;
      div.setAttribute('style', base_style
        + 'top: ' + (offset.top - inset) + 'px;'
        + 'left:' + (offset.left - inset) + 'px;'
        + 'width: ' + elem.offsetWidth + 'px;'
        + 'height: ' + elem.offsetHeight + 'px;');
      this.document.body.appendChild(div);
    });
  }
}
window._css_selector_helper = new CSSSelectorHelperServer(this); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment