home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
PC World Komputer 1997 May
/
Pcwk0597.iso
/
borland
/
ib
/
setups
/
intrabld
/
data.z
/
CRAWLER.JFM
< prev
next >
Wrap
Text File
|
1996-12-11
|
23KB
|
679 lines
//#define DEBUG
/****************************************************************************\
* *
* Crawler.jfm -- Given a URL to start with, this application descends the *
* local web tree indexing all connected pages. It then *
* generates a report that, when viewed through a browser, *
* contains a hyperlink to each page. *
* *
* Crawler.jfm is part of the web utilities solution application. It is meant *
* to be used by systems administrators who would like to index their site. *
* Used in conjunction with the intrabot it is possible to detect which *
* machines are running an http server(using intrabot), and then what pages *
* are on those servers (using crawler). *
* *
* The netsite box on the crawler's main page is linked to the hosts table *
* created by the intrabot. This makes it easy for you to descend the web *
* tree of the hosts detected by the intrabot. You may also enter a valid URL *
* for any site (e.g. http://www.elvisIsKing.com/) and begin your descent *
* there. (Note: when used remotely you are forced to choose a value from the *
* select control. This is a limitation of the current HTML specification.) *
* You MUST limit the scope of your search by filling in the *
* appropriate range of ip addresses, otherwise this crawler might attempt a *
* trip around the entire internet. That would take a very long time *
* (days, weeks, months, years who knows?). *
* *
* *
* Dependencies: reqstURL.dll *
* localurl.qry *
* urlque.qry *
* borlnd21.jpg *
* websrc11.jpg *
* *
* * *
* Links to: *
* *
* Updated 11/12/96 by IntraBuilder Samples Group *
* $Revision: 1.6 $ *
* *
* Copyright (c) 1996, Borland International, Inc. All rights reserved. *
* *
\****************************************************************************/
//link to external dlls
// Both functions are implemented in reqstURL.dll (listed under Dependencies above).
// isLocal takes a host string; setRange takes two numeric internet addresses.
// Both return a boolean.
extern boolean isLocal(char* /*host*/) "reqstURL.dll";
extern boolean setRange(char* /*numeric internet address*/, char* /*numeric internet address*/)"reqstURL.dll";
// Page numbers assigned to form.pageno to switch between the form's pages.
#define mainPage 1
#define successPage 2
#define errorPage 3
// Startup: create the form, hand it this script's arguments, and open it.
var f = new crawlerForm();
f.argv=CRAWLER.arguments; //this loads arguments into the f.argv.Check the length
//and values in onServerLoad.
f.open();
// NOTE(review): the second "new crawlerForm()" / "open()" pair below the End Header
// marker follows this unconditional return, so it looks unreachable -- presumably
// emitted by the form designer; confirm before removing.
return;
// {End Header} Do not remove this comment//
// Generated on 11/12/96
//
var f = new crawlerForm();
f.open();
class crawlerForm extends Form {
// ---- crawlerForm constructor section: queries and controls ----
// Load CONTROLS.CC from APPS\SHARED under the directory returned by _sys.env.home().
// NOTE(review): presumably this is where the Rule and GeneratedHTML classes used
// below are defined -- confirm.
_sys.scripts.load(_sys.env.home() + "APPS\\SHARED\\CONTROLS.CC")
// Form-level event handlers, size and title.
with (this) {
onServerSubmit = class::Form_onServerSubmit;
onServerLoad = class::Form_onServerLoad;
onServerUnload = class::Form_onServerUnload;
color = "white";
height = 14.6667;
left = 0;
top = 0;
width = 78;
title = "Intranet Crawler";
}
// urlque1: the work queue. iterateQue reads URLs from its first row and popQue
// deletes that row; processFile appends newly discovered URLs.
with (this.urlque1 = new Query()){
left = 42;
top = 0;
sql = "@urlQue.QRY";
active = true;
with (rowset) {
}
}
// range1: Form_onServerLoad reads its "low IP" / "high IP" fields to pre-fill
// the two range text fields.
with (this.range1 = new Query()){
left = 48;
top = 0;
sql = "@range.qry";
active = true;
with (rowset) {
}
}
// localurl1: one row per processed URL (URL, size, last modified, type);
// checkLocal searches it to skip URLs that were already processed.
with (this.localurl1 = new Query()){
left = 30;
top = 0;
sql = "@localURL.QRY";
active = true;
with (rowset) {
}
}
// hosts1: its rowset becomes the form's rowset at the end of this section.
with (this.hosts1 = new Query()){
left = 36;
top = 0;
sql = "@hosts.qry";
active = true;
with (rowset) {
}
}
// Two horizontal rules.
// NOTE(review): pageno = 0 presumably means "shown on every page" -- confirm.
with (this.rule1 = new Rule(this)){
top = 4;
size = 2;
right = 76;
pageno = 0;
}
with (this.rule2 = new Rule(this)){
top = 11.5;
size = 2;
right = 76;
pageno = 0;
}
// "Start" button: validates the input on the server (button1_onServerClick).
with (this.runButton = new Button(this)){
onServerClick = class::button1_onServerClick;
top = 10;
width = 10;
text = "Start";
}
// Low and high ends of the IP range that limits the crawl.
with (this.lowIPField = new Text(this)){
left = 40;
top = 7;
width = 28;
value = "";
}
with (this.highIPField = new Text(this)){
left = 40;
top = 8;
width = 28;
value = "";
}
// "From:" / "To:" labels next to the two range fields.
// NOTE(review): HTML3/HTML4 differ from html3/html4 further down only by case --
// confirm the runtime treats these as distinct property names.
with (this.HTML3 = new HTML(this)){
height = 1;
left = 30;
top = 7;
width = 9;
color = "black";
alignVertical = 1;
alignHorizontal = 2;
text = "From:";
}
with (this.HTML4 = new HTML(this)){
height = 1;
left = 30;
top = 8;
width = 9;
color = "black";
alignVertical = 1;
alignHorizontal = 2;
text = "To:";
}
// Page 2 (successPage): confirmation prompt with OK and Back buttons.
with (this.HTML5 = new HTML(this)){
height = 1;
top = 5;
width = 76;
color = "black";
text = "Running the crawler might take a long time. Proceed?";
pageno = 2;
}
// OK uses onClick (not onServerClick); its handler calls this.form.submit().
with (this.okButton = new Button(this)){
onClick = class::okButton_onClick;
top = 7;
width = 14;
text = " OK ";
pageno = 2;
}
with (this.backButtonPage2 = new Button(this)){
onServerClick = class::backButtonPage2_onServerClick;
top = 9;
width = 14;
text = "Back";
pageno = 2;
}
// Page 3 (errorPage): the text is overwritten with the actual error message
// before the page is shown.
with (this.errorHTML = new HTML(this)){
height = 4;
top = 5;
width = 76;
color = "black";
text = "This is the error page.<br>";
pageno = 3;
}
// Progress display written by iterateQue; starts hidden and is made visible
// only while Form_onServerSubmit runs.
with (this.displayArea = new TextArea(this)){
visible = false;
height = 4;
left = 16;
top = 7;
width = 54;
value = "";
pageno = 2;
}
// Label for the starting-point ("Netsite") field.
with (this.html1 = new HTML(this)){
height = 1;
top = 5;
width = 30;
color = "black";
text = "Begin at Netsite (URL or IP)";
}
// Banner image.
with (this.image1 = new Image(this)){
height = 3.4167;
width = 10.25;
dataSource = "filename WEBSRC11.JPG";
alignment = 4;
pageno = 0;
}
// Back button on the error page.
with (this.backButtonPage3 = new Button(this)){
onServerClick = class::backButtonPage3_onServerClick;
top = 9;
width = 14;
text = "Back";
pageno = 3;
}
// Page heading.
with (this.html2 = new HTML(this)){
height = 2;
left = 12;
width = 64;
color = "80c0";
text = "<h1>Intranet Crawler</h1>";
pageno = 0;
}
// Starting point of the crawl: an IP address or a URL (see button1_onServerClick).
with (this.host = new Text(this)){
left = 30;
top = 5;
width = 40;
value = "";
}
// Instructions for the IP range fields.
with (this.html3 = new HTML(this)){
height = 3;
top = 7;
width = 30;
color = "black";
text = "Enter the IP addresses you would like to search between. (e.g. 123.123.123.123)";
}
// Sub-heading under the page title.
with (this.html4 = new HTML(this)){
height = 1.2083;
left = 12;
top = 2;
width = 64;
color = "black";
text = "<H2>will find web objects on the intranet</H2>";
}
with (this.GeneratedHTML1 = new GeneratedHTML(this)){
height = 1;
top = 12;
width = 76;
pageno = 0;
}
// The form's own rowset is the hosts table's rowset.
this.rowset = this.hosts1.rowset;
function button1_onServerClick()
{
    // Server-side handler for the "Start" button.
    // Validates the range fields and the Netsite field, stores the starting
    // URL in form.url, and turns to the success page; on any failure it puts
    // a message in form.errorHTML and turns to the error page instead.

    // Step 1: the two IP range fields must hold valid, ordered addresses.
    var fieldErrors = new String();
    if (form.errorsInFields(fieldErrors)) {
        form.errorHTML.text = fieldErrors;
        form.pageno = errorPage;
        return;
    }

    // Step 2: work out the starting URL and the bare host name to range-check.
    var startHost;
    var netsiteAsIP = new IpAddress(form.host.value);
    if (!netsiteAsIP.isValid) {
        // Not a dotted address, so it has to be a URL.
        var netsiteAsUrl = new Url(form.host.value);
        startHost = netsiteAsUrl.host;
        if (netsiteAsUrl.isValid == false) {
            form.errorHTML.text = "Error: Netsite is not a valid IP address or URL.<br>";
            form.pageno = errorPage;
            return;
        }
        form.url = netsiteAsUrl;
    } else {
        // Dotted address: start at that machine's default page.
        form.url = new Url("http://" + netsiteAsIP.text + "/");
        startHost = netsiteAsIP.text;
        var colonAt = startHost.indexOf(":");
        if (colonAt > 0) {
            // Drop a trailing ":port" before the range check.
            startHost = startHost.substring(0, colonAt);
        }
    }

    // Step 3: hand the range to the DLL, then make sure the start is inside it.
    if (!(setRange(form.lowIPField.value, form.highIPField.value))) {
        form.errorHTML.text = "Error: unable to clear resources necessary to proceed. Try again later.<br>";
        form.pageno = errorPage;
        return;
    }
    if (isLocal(startHost) == false) {
        form.errorHTML.text = "Error: Starting machine is not within specified range.<br>";
        form.pageno = errorPage;
        return;
    }

    // Everything checked out: show the confirmation page.
    form.pageno = successPage;
}
function iterateQue()
{
    // Works through the URL queue until it is empty or form.running is cleared.
    // For each URL at the top of the queue:
    //   1. start a new row for it in the local-URL table,
    //   2. either fetch only its header (extensions filterFiles rejects) or
    //      fetch the whole response and queue the links found in it,
    //   3. save the row and remove the URL from the queue.
    //
    // Changes from the previous version: infoFromServer is now a local (it was
    // assigned without var and leaked as a global), and the locals that were
    // declared but never used have been removed.

    // filterFiles assigns a description to this variable for rejected extensions.
    // NOTE(review): this relies on the caller seeing that assignment, i.e. on
    // by-reference parameter passing -- confirm for the target runtime.
    var type=new String();
    var doRetrieve;
    var infoFromServer;

    form.urlque1.rowset.first();
    while (form.urlque1.rowset.endOfSet==false && form.running==true) {
        // Other methods move the queue's row cursor (shouldAddToQue locates,
        // processFile appends), so go back to the top on every pass.
        form.urlque1.rowset.first();
        var urlToProcess=new Url(form.urlque1.rowset.fields["URL"].value);

        // Record the URL in the local-URL table; the remaining fields of the
        // new row are filled in by processHeader / processFile.
        form.localurl1.rowset.beginAppend();
        form.localurl1.rowset.fields["URL"].value=urlToProcess.text;
        form.url=urlToProcess;

        doRetrieve=filterFiles(urlToProcess,type);
        if (doRetrieve==false) {
            // Extension says there are no links inside: header only.
            form.displayArea.value="Getting info on " + urlToProcess.text +"\n";
            var headResponse=new HeadResponse(urlToProcess);
            processHeader(headResponse.headerFile,type, this);
        } else {
            form.displayArea.value="Retrieving " + urlToProcess.text +"\n";
            infoFromServer=new HttpResponse(urlToProcess);
            processFile(infoFromServer, this);
        }

        form.localurl1.rowset.save();
        form.popQue();      // done with this URL
    }
    return;
}
function errorsInFields(errorString)
{
    // Validates the low/high IP range fields.
    //
    // errorString: receives one "Error: ...<br>" line per problem found.
    //   NOTE(review): the caller reads this variable afterwards, which relies
    //   on by-reference parameter passing -- confirm for the target runtime.
    // Returns true when at least one problem was found, false otherwise.
    //
    // Changes from the previous version:
    //   * the order check used (high - low) + 1 < 0, which let a low address
    //     exactly one above the high address through; the two values are now
    //     compared directly;
    //   * the order check is skipped when either address is invalid, so it no
    //     longer runs toDecimal() on an address already reported as bad;
    //   * the undeclared machineRange variable (an implicit global) is gone.
    //     NOTE(review): nothing in this file reads machineRange -- confirm no
    //     other script depended on it.
    var errors=false;
    errorString="";
    var ipLow=new IpAddress(form.lowIPField.value);
    var ipHigh=new IpAddress(form.highIPField.value);

    if (ipLow.isValid==false) {
        errorString+="Error: Low IP address is not valid.<br>";
        errors=true;
    }
    if (ipHigh.isValid==false) {
        errorString+="Error: High IP address is not valid.<br>";
        errors=true;
    }

    // Only meaningful once both addresses parsed. Equal addresses are still
    // accepted (a range of one machine), as before.
    if (errors==false && ipHigh.toDecimal() < ipLow.toDecimal()) {
        errorString+="Error:Low address must be a smaller number than the high address.<br>";
        errors=true;
    }
    return errors;
}
function popQue()
{
    // Removes the first row of the URL queue.
    var queue = form.urlque1.rowset;
    queue.first();
    queue.delete();
}
function shouldAddToQue(url)
{
    // Decides whether url should be appended to the URL queue.
    // Returns true only when checkLocal accepts it, its protocol is "http",
    // and no row with the same URL text is found in the queue.

    // checkLocal runs first; the protocol is only looked at when it passes.
    if (form.checkLocal(url) == false || url.protocol != "http") {
        return false;
    }

    // Look for the URL in the queue; ending up at end-of-set means no match.
    var queue = form.urlque1.rowset;
    queue.beginLocate();
    queue.fields["URL"].value = url.text;
    queue.applyLocate();
    if (queue.endOfSet) {
        return true;
    }
    return false;       // already queued, no need to add it again
}
function checkLocal(url)
{
if (!url.isValid) { //must be valid url
return false;
}
if (!isLocal(url.host)){ //must be local
return false;
}
form.localurl1.rowset.beginLocate();
form.localurl1.rowset.fields["URL"].value=url.text;
try {
form.localurl1.rowset.applyLocate();
}
catch (Exception e) {
#ifdef DEBUG
_sys.scriptOut.writeln("ERROR: " + e.message + " (" + e.code + ")");
_sys.scriptOut.writeln(" " + url.text);
#endif
return false;
}
if (!form.localurl1.rowset.endOfSet){ //yes already in local file
return false; //if it's already in the local file we don't need to process it again
}
return true;
}
function button2_onServerClick()
{
    // Server-side click handler that starts processing the URL queue.
    // No control in this file is wired to this handler.
    form.iterateQue();
}
function Form_onServerUnload()
{
    // Form is going away: clear the running flag (iterateQue stops looping
    // when it is false), then unload the network class definitions that
    // Form_onServerLoad loaded.
    this.running = false;
    _sys.scripts.unload("netClass.js");
}
function Form_onServerLoad()
{
    // Prepares the form when it is loaded on the server: loads the network
    // classes, initialises the form's state, pre-fills the input fields, and
    // shows the main page.

    // Load the class definitions up front; this speeds up later calls to the
    // functions defined in that file.
    _sys.scripts.load("netClass.js");
    loadExternalFunctions();    // must be called immediately after loading the net classes

    this.submit = this.onServerSubmit;
    this.url = new Url("");
    this.running = true;        // cleared again in Form_onServerUnload

    // A first launch argument, when present, pre-fills the Netsite field.
    if (this.argv.length > 0) {
        this.host.value = this.argv[0];
    }

    // Pre-fill the range fields from the current row of the range table, if any.
    var rangeRow = this.range1.rowset;
    if (!(rangeRow.endOfSet)) {
        this.lowIPField.value = rangeRow.fields["low IP"].value;
        this.highIPField.value = rangeRow.fields["high IP"].value;
    }

    this.pageno = mainPage;     // turn to the front page
}
function Form_onServerSubmit()
{
    // Runs the crawl: empties both tracking tables, seeds the queue with
    // form.url, processes the queue, and runs the "crawler" report.
    // Shows the error page instead when the tables cannot be emptied.
    var queue = this.urlque1.rowset;
    var visited = this.localurl1.rowset;

    // Empty the URL queue.
    queue.first();
    while (!(queue.endOfSet)) {
        this.popQue();
    }

    // Empty the table of already-processed URLs the same way.
    visited.first();
    while (!(visited.endOfSet)) {
        visited.first();
        visited.delete();
    }

    // Rows still present after the clean-up mean we cannot start.
    if (queue.count() > 0 || visited.count() > 0) {
        this.errorHTML.text = "Error: Unable to clear resources necessary to proceed. Try again later.<br>";
        this.pageno = errorPage;
        return;
    }

    // Show the progress area while running in the IDE. Through a browser this
    // does nothing and the area stays invisible, which is what we want there.
    this.displayArea.visible = true;

    // Seed the queue with the starting URL and crawl.
    queue.beginAppend();
    queue.fields["URL"].value = form.url.text;
    queue.save();
    this.iterateQue();

    _sys.reports.run("crawler",1,1);

    // Finished: no more progress to display.
    this.displayArea.visible = false;
}
function backButtonPage2_onServerClick()
{
    // "Back" on the confirmation page: return to the main page.
    form.pageno = mainPage;
}
function okButtonPage3_onServerClick()
{
    // Returns to the main page.
    // No control in this file is wired to this handler.
    form.pageno = mainPage;
}
function backButtonPage3_onServerClick()
{
    // "Back" on the error page: return to the main page.
    form.pageno = mainPage;
}
function getPath(URL)
{
    // {Export} This comment causes this function body to be sent to the client
    // Returns URL up to and including its last "/", unescaped -- that is, the
    // path without the file name. A URL with no "/" yields an empty string.
    var endOfPath = URL.lastIndexOf("/") + 1;
    return unescape(URL.substring(0, endOfPath));
}
function okButton_onClick()
{
    // Click handler for the OK button on the confirmation page: schedules
    // showStatus() and then submits the form.
    var statusDelay = 20000;
    setTimeout("showStatus()", statusDelay);
    this.form.submit();
    //document.form[0].submit(); //Use this line with MS IE 3.0 . IE still has trouble with Java Script.
}
function showStatus()
{
    // {Export} This comment causes this function body to be sent to the client
    // Opens the update.jfm status form in a separate 400x200 window.
    // The path is still computed, but only the commented-out relative
    // variant below would use it.
    var path = getPath(window.location.href);
    // window.open(path+"update.jfm","","height=200,width=400");
    var statusPage = "/svr/intrasrv.isv?apps/webutils/update.jfm";
    var windowFeatures = "height=200,width=400";
    window.open(statusPage, "", windowFeatures);
}
}
function processReturnCode(header, form)
{
    // Reports whether the server answered with "200".
    //   header: object whose returnCode string holds the response status
    //   form:   accepted but not used here
    // Returns true when "200" occurs anywhere in header.returnCode,
    // false otherwise.
    var statusText = header.returnCode;
    return statusText.indexOf("200") >= 0;
}
function processFile(infoFromServer, form)
{
    // Records a fully retrieved response in the current local-URL row and
    // queues every link in it that still needs to be visited.
    //   infoFromServer: response object providing .header and .html
    //   form:           the crawler form (owns the localurl1 / urlque1 queries)
    //
    // On a "200" response the size, modification date and content type are
    // stored and each reference found in the page is offered to
    // form.shouldAddToQue. On any other response the "Type of file" field
    // receives the return code text from character 4 onward.
    //
    // Change from the previous version: tag and i are now declared with var;
    // they used to be assigned undeclared and leaked as globals.
    var header = infoFromServer.header;

    if (processReturnCode(header, form)) {
        form.localurl1.rowset.fields["Size of file"].value = header.contentLength;
        form.localurl1.rowset.fields["Last modified"].value = header.lastModified;
        form.localurl1.rowset.fields["Type of file"].value = header.contentType;

        var tag = infoFromServer.html.getReferences();     // collect all the href tags
        for (var i = 0; i < tag.length; i++) {
            var tempUrl = new Url(tag[i]);
            if (form.shouldAddToQue(tempUrl)) {
                form.urlque1.rowset.beginAppend();
                form.urlque1.rowset.fields["URL"].value = tempUrl.text;
                form.urlque1.rowset.save();
            }
        }
    } else {
        var returnCode = header.returnCode;
        // NOTE(review): skipping 4 characters presumably drops a leading
        // "NNN " status number -- confirm against the header class.
        form.localurl1.rowset.fields["Type of file"].value = returnCode.substring(4, returnCode.length);
    }
}
function processHeader(header,type, form)
{
    // Records header-only information in the current local-URL row.
    //   header: response header object (returnCode, contentLength,
    //           lastModified, contentType)
    //   type:   fallback description stored when the server did not answer "200"
    //   form:   the crawler form (owns the localurl1 query)
    //
    // Change from the previous version: processReturnCode is declared as
    // (header, form) but was called with (header, type, form), which put type
    // in the form slot. It is now called with its declared arguments.
    var row = form.localurl1.rowset;

    if (processReturnCode(header, form)) {
        row.fields["Size of file"].value = header.contentLength;
        row.fields["Last modified"].value = header.lastModified;
        row.fields["Type of file"].value = header.contentType;
    } else {
        row.fields["Type of file"].value = type;
        row.fields["Size of file"].value = "not available";
        row.fields["Last modified"].value = "not available";
    }
}
function filterFiles(url,type)
{
    // Decides from the extension alone whether a URL is worth retrieving.
    //   url:  URL object; only url.extension is examined
    //   type: assigned the description of the extension when it is rejected
    //         NOTE(review): the caller reads type afterwards, which relies on
    //         by-reference parameter passing -- confirm for the target runtime.
    // Returns false for extensions in the table below (files that will not
    // contain links to other pages) and true for everything else.
    var skip = new AssocArray();

    // Images and image maps
    skip["jpeg"] = "image/jpg";
    skip["jpg"] = "image/jpg";
    skip["tiff"] = "image/tiff";
    skip["gif"] = "image/gif";
    skip["ras"] = "raster";
    skip["xbm"] = "x-bitmap";
    skip["xpm"] = "x-pixmap";
    skip["xwd"] = "x window dump";
    skip["map"] = "map file";

    // Archives and encoded files
    skip["zip"] = "compressed file";
    skip["gzip"] = "gunzip file";
    skip["tar"] = "tar compressed";
    skip["cpio"] = "x-cpio";
    skip["sit"] = "stuffit";
    skip["hqx"] = "bin-hex";
    skip["Z"] = "Compressed file";

    // Programs and scripts
    skip["exe"] = "executable";
    skip["bat"] = "batch";
    skip["cmd"] = "command file";
    skip["pl"] = "pearl script";
    skip["sh"] = "bourne script";
    skip["csh"] = "c-shell script";
    skip["ksh"] = "korne script";
    skip["class"] = "java applet";

    // Documents
    skip["pdf"] = "adobe portable document";
    skip["rtf"] = "rich text";
    skip["tex"] = "x-tex";
    skip["dvi"] = "x-dvi";
    skip["ai"] = "post-script";
    skip["eps"] = "post-script";
    skip["ps"] = "post-script";

    // Video
    skip["avi"] = "video";
    skip["mpeg"] = "video";
    skip["mpg"] = "video";
    skip["mpe"] = "video";
    skip["mpv"] = "video";
    skip["mpegv"] = "video";
    skip["vbs"] = "video";

    // Audio
    skip["ra"] = "Real Audio";
    skip["rm"] = "Real Audio";
    skip["wav"] = "Wave file";

    // Anything not in the table gets retrieved.
    if (!skip.isKey(url.extension)) {
        return true;
    }
    type = skip[url.extension];
    return false;
}//end filterFiles();
function getPath(URL)
{
    // {Export} This comment causes this function body to be sent to the client
    // Returns URL up to and including its last "/", unescaped -- that is, the
    // path without the file name. A URL with no "/" yields an empty string.
    var endOfPath = URL.lastIndexOf("/") + 1;
    return unescape(URL.substring(0, endOfPath));
}
function setTimeout(x,y)
{
    // Deliberately empty: accepts two arguments and does nothing.
    // NOTE(review): presumably a server-side stand-in so that the call in
    // okButton_onClick resolves outside a browser -- confirm.
    return;
}