搜索引擎原理与实践_源程序
- 1、下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
- 2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
- 3、如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。
9.6 源程序
9.6.1 FrontierSchedulerForBjfu类
package org.archive.crawler.postprocessor;
import org.archive.crawler.datamodel.CandidateURI;
/**
 * Frontier scheduler that only forwards selected candidate URIs to the
 * crawl frontier.
 *
 * <p>A URI is forwarded when its string form contains {@code "dns:"}, or when
 * it contains {@code "bjfu"} together with at least one of the page-type
 * markers listed in {@link #PAGE_MARKERS}. Every other URI is silently
 * dropped.
 */
public class FrontierSchedulerForBjfu extends FrontierScheduler {

    /** Substrings that mark a URI as pointing at an accepted page type. */
    private static final String[] PAGE_MARKERS = {
        ".html", ".htm", ".jsp", ".asp", ".aspx"
    };

    /**
     * Creates the scheduler.
     *
     * @param name passed unchanged to the superclass constructor
     */
    public FrontierSchedulerForBjfu(String name) {
        super(name);
    }

    /**
     * Schedules {@code caUri} on the controller's frontier if it passes the
     * filter described on the class; otherwise does nothing.
     *
     * <p>Accepted non-DNS URIs are also echoed to standard output before
     * being scheduled.
     *
     * @param caUri candidate URI to inspect; its {@code toString()} form is
     *              what the filter is matched against
     */
    protected void schedule(CandidateURI caUri) {
        final String text = caUri.toString();

        // DNS entries are always passed through, without being printed.
        if (text.contains("dns:")) {
            getController().getFrontier().schedule(caUri);
            return;
        }

        // Everything else must mention "bjfu" and look like a page.
        if (!text.contains("bjfu") || !hasPageMarker(text)) {
            return;
        }

        System.out.println(text);
        getController().getFrontier().schedule(caUri);
    }

    /**
     * Reports whether {@code text} contains any of the accepted page-type
     * markers, anywhere in the string.
     */
    private static boolean hasPageMarker(String text) {
        for (String marker : PAGE_MARKERS) {
            if (text.contains(marker)) {
                return true;
            }
        }
        return false;
    }
}
9.6.2 Page类
package .bjfu.search.page;
/**
 * Mutable holder for the data kept about one page: its URL, title, summary,
 * context text and an integer score.
 *
 * <p>A freshly constructed instance has all text fields set to {@code null}
 * and a score of 10. The class performs no validation and is not
 * synchronized.
 */
public class Page {
    private String url;
    private String title;
    private String summary;
    // NOTE(review): presumably the page's body text — confirm against callers.
    private String context;
    private int score;

    /**
     * Creates an empty page: every text field is {@code null} and the score
     * starts at 10.
     */
    public Page() {
        // Reference fields already default to null; only the score needs a value.
        score = 10;
    }

    /** @return the stored URL, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the stored summary, or {@code null} if none has been set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the stored context text, or {@code null} if none has been set */
    public String getContext() {
        return this.context;
    }

    /**
     * Stores the context text. This is the conventionally named setter,
     * matching the other {@code setXxx} accessors of this class.
     *
     * @param context the context text to store
     */
    public void setContext(String context) {
        this.context = context;
    }

    /**
     * Legacy, mis-capitalized spelling of {@link #setContext(String)}, kept so
     * that existing callers continue to compile.
     *
     * @param context the context text to store
     * @deprecated use {@link #setContext(String)} instead
     */
    @Deprecated
    public void SetContext(String context) {
        setContext(context);
    }

    /** @return the stored score (10 unless changed) */
    public int getScore() {
        return this.score;
    }

    /** @param score the score to store */
    public void setScore(int score) {
        this.score = score;
    }
}
9.6.3 Extractor类
package .bjfu.search.extractor;
import org.htmlparser.*;
import org.htmlparser.util.*;
import org.htmlparser.visitors.*;
import org.htmlparser.nodes.*;
import org.htmlparser.tags.*;
import .bjfu.search.page.*;
import .bjfu.search.util.*;
public class Extractor implements Runnable{
private String filename;
private Parser parser;
private Page page;
private String encode;
/**
 * Stores the given value in this extractor's {@code encode} field.
 * NOTE(review): presumably the character-encoding name used when parsing the
 * page file — the code that reads this field is not visible here; confirm.
 *
 * @param encode the value to store; no validation is performed
 */
public void setEncode(String encode){
this.encode = encode;
}
private String combineNodeText(Node[] nodes){
StringBuffer buffer = new StringBuffer();
for(int i = 0; i < nodes.length; i++){
Node anode = (Node)nodes[i];
String line = null;
if(anode instanceof TextNode){
TextNode textnode = (TextNode)anode;
line = textnode.getText();
}
else if (anode instanceof LinkTag){
LinkTag linknode = (LinkTag) anode;
line = linknode.getLinkText();
}
else if (anode instanceof Div){
if(anode.getChildren() != null){
line = combineNodeText(anode.getChildren().toNodeArray());
}
}
else if (anode instanceof ParagraphTag){
if(anode.getChildren() != null){