Google.js
I am learning to love Windows. Yes, it has its own problems and can sometimes be a bit insecure, but in general it just works well, and every single piece of it is reusable and scriptable. Today, I quickly wrote a Google search scraper. It runs straight from the command line.
Pay attention to the size of the script and the level of clarity of the code. Moreover, it is well integrated with the host's setup and connection settings (proxies, SOCKS, etc.). You cannot do that even with Python. Thanks, Microsoft.
[/files/2007/10/google.js](/files/2007/10/google.js)
There are two ways you can run the script. If your default scripting engine is `cscript`, then you can just type: `google.js` **some query here**. If this is not the case, then you either need to be explicit, like this: `cscript /nologo google.js` **some query here**, or make `cscript` the default, like this: `cscript //H:CScript`. Whatever you do, the code will run and will work flawlessly.
I enjoy when things are plain and easy or just simply clever.
Archived Comments
cscript //H:CScript
// Command-line handling (Windows Script Host JScript).
// With no arguments: print the usage banner and exit with status 1.
// Otherwise: join every argument with single spaces into the global `query`,
// which the search loop further down reads.
var args = WScript.Arguments;
var query;

if (args.length == 0) {
    var scriptName = WScript.ScriptName;
    var banner = [
        'usage: ' + scriptName + ' <query>',
        ' ' + scriptName + ' site:gnucitizen.org ext:js',
        '',
        'Google Search',
        'by Petko D. Petkov (pdp) GNUCITIZEN (http://www.gnucitizen.org)'
    ];
    for (var b = 0; b < banner.length; b++) {
        WScript.Echo(banner[b]);
    }
    WScript.Quit(1);
} else {
    var words = [];
    for (var k = 0; k < args.length; k++) {
        words[k] = args(k);
    }
    query = words.join(' ');
}
/*
 * Google result scraper: pages through the XHTML search front end ten results
 * at a time and echoes every result URL found on each page.
 *
 * The pure helpers below carry the URL/link logic; runSearch() is the only
 * part that touches Windows Script Host objects.
 */

// Result links appear as "/gwt/...u=<encoded target>"; group 1 captures
// everything after the first "u=" (the still percent-encoded target URL).
var GWT_LINK_PATTERN = /^\/gwt\/.*?u=(.*?)$/;

// Number of results the start offset is advanced by for each page.
var PAGE_SIZE = 10;

/**
 * Builds the request URL for one page of results.
 *
 * encodeURIComponent is used instead of escape() because escape() leaves "+"
 * untouched (so a query such as "c++" would not be sent literally) and
 * produces non-standard %uXXXX sequences for non-ASCII characters.
 *
 * @param {string} query Search terms exactly as typed by the user.
 * @param {number} pos   Zero-based offset of the first result; 0 omits "&start".
 * @returns {string} Absolute URL to request.
 */
function buildSearchUrl(query, pos) {
    var url = 'http://www.google.com/xhtml?q=' + encodeURIComponent(query);
    if (pos !== 0) {
        url += '&start=' + pos;
    }
    return url;
}

/**
 * Extracts the decoded target URL from one anchor's href.
 *
 * @param {?string} href Value returned by getAttribute('href'); null/undefined
 *                       when the anchor carries no href attribute.
 * @returns {?string} Decoded target, or null when href is missing or is not a
 *                    "/gwt/...u=" result link.
 */
function extractTarget(href) {
    if (typeof href !== 'string') {
        // An anchor without href must not abort processing of the whole page.
        return null;
    }
    var match = href.match(GWT_LINK_PATTERN);
    if (!match) {
        return null;
    }
    try {
        // Decodes UTF-8 percent sequences correctly, which unescape() does not.
        return decodeURIComponent(match[1]);
    } catch (ex) {
        // Malformed percent sequence: fall back to the lenient legacy decoder
        // rather than dropping the link.
        return unescape(match[1]);
    }
}

/**
 * Collects every result target found in a parsed page, in document order.
 *
 * @param {Object} doc Parsed document exposing getElementsByTagName(); the
 *                     returned list must provide `length` and `item(i)`.
 * @returns {string[]} Decoded result URLs (possibly empty).
 */
function collectLinks(doc) {
    var anchors = doc.getElementsByTagName('a');
    var links = [];
    for (var i = 0; i < anchors.length; i++) {
        var target = extractTarget(anchors.item(i).getAttribute('href'));
        if (target !== null) {
            links.push(target);
        }
    }
    return links;
}

/**
 * Tells whether two pages contain the same set of links, ignoring order.
 * Works on sorted copies so neither input is reordered (the caller still
 * prints the links in the order they appeared on the page).
 *
 * @param {string[]} a
 * @param {string[]} b
 * @returns {boolean}
 */
function sameLinks(a, b) {
    // "\n" cannot occur inside a URL, so joined strings compare unambiguously.
    return a.slice(0).sort().join('\n') === b.slice(0).sort().join('\n');
}

/**
 * Fetches result pages for `query` and echoes each result URL on its own line.
 * Stops at the first page that yields no links or that repeats the previous
 * page (presumably what the server returns once the offset runs past the last
 * result -- the original loop relied on the same check).
 *
 * Network errors raised by open()/send() are not caught and terminate the
 * script, as before.
 *
 * @param {string} query Search terms.
 */
function runSearch(query) {
    var doc = WScript.CreateObject('MSXML2.DOMDocument');
    var xhr = WScript.CreateObject('Microsoft.XMLHTTP');
    var previous = null;
    var pos = 0;

    doc.async = false;
    doc.validateOnParse = false;

    for (;;) {
        var links;

        xhr.open('GET', buildSearchUrl(query, pos), false);
        xhr.send(null);

        try {
            doc.loadXML(xhr.responseText);
            links = collectLinks(doc);
        } catch (ex) {
            // Deliberate best effort: a page that cannot be processed is
            // treated as containing no results, which ends the paging below.
            links = [];
        }

        if (links.length === 0) {
            break;
        }
        if (previous !== null && sameLinks(previous, links)) {
            break;
        }

        for (var i = 0; i < links.length; i++) {
            WScript.Echo(links[i]);
        }

        previous = links;
        pos += PAGE_SIZE;
    }
}

// Run only under Windows Script Host; `query` is the global assembled by the
// command-line handling earlier in this file.
if (typeof WScript !== 'undefined') {
    runSearch(query);
}