lucene实例

合集下载
  1. 下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
  2. "仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
  3. 如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。

package com.learn.lucene.myLucene; // NOTE(review): leading segment lost in extraction — "com" assumed, confirm against project layout

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class MyLucene {
public static int t=0;
//索引目录
//private static Directory indexDir ;
//分词器
//private static Analyzer analyzer ;
//private static IndexWriterConfig iwc;
//索引添加、更新器
private static IndexWriter writer;
//索引创建方式
//private static OpenMode openMode = OpenMode.CREATE_OR_APPEND;
private static DirectoryReader reader;
public static void main(String[] args) {
System.out.println("ss");
try {
createIndex();
searchFile();
} catch (Exception e){
e.printStackTrace();
}
}
public static void createIndex() throws Exception {
OpenMode openMode = OpenMode.CREATE_OR_APPEND;
/** 这里放索引文件的位置*/
Directory indexDir =FSDirectory.open( new File("d:\\index") );//索引目录
//Analyzer analyzer = new ChineseAnalyzer();
Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_45);
//Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_45); //分析器
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45,analyzer);
//iwc.setOpenMode( openMode );//分词器
writer = new IndexWriter(indexDir, iwc);//IndexWriter将文档加入索引
//使用writer做为reader的源可减少消耗
reader = DirectoryReader.open( writer,false );
File file = new File("d:\\file");//文件的编码格式一定是UTF-8,对中文的支持
readFile(file);
mit();
}
public static void readFile(File file) throws Exception{
// 增加document到索引去
System.out.println("file.isDirectory() "+file.isDirectory()+" "+file.getPath());
if (file.isDirectory()){
File[] files = file.listFiles();
for (int i = 0; i < files.length; i++){
file = files[i];
if (file.isDirectory()){
readFile(file);
continue;
}
Document doc = new Document();
String filePath = file.getPath();
System.out.println("*******readFile*******"+filePath);
//路径
doc.add(new StringField("path", filePath, Field.Store.YES));
//修改时间
//Thread.currentThread().sleep(1000);
doc.add(new LongField("lastModifiedTime", (new Date()).getTime(), Field.Store.YES));
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
doc.add(new TextField("contents", br));
doc.add(new Field("fieldname", "详细信息",TextField.TYPE_STORED));//可以对其检索和存储
doc.add(new Field("field", "asd t ",TextField.TYPE_STORED));//英文空格匹配,中文单字匹配
Term term = new Term("path",filePath);//搜索的最小单位(即文档)
writer.updateDocument( term,doc);//使用更新不会对文档重复建索引
//writer.addDocument(doc);
}
}
}
/**
* 分词建索引(目标文件,目标源),分词搜索(参数字符串),分词命中
*/
public static void searchFile() throws IOException, ParseException, InvalidTokenOffsetsException{
BooleanQuery bq = new BooleanQuery();
DirectoryReader tmpRe = DirectoryReader.openIfChanged(reader);
if( tmpRe != null ){
reader = tmpRe;
}
IndexSearcher searcher = new IndexSearcher(reader);
//Analyzer analyzer = new ChineseAnalyzer();
Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_45);
//Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_45);
QueryParser parser = new QueryParser(Version.LUCENE_45, "contents", analyzer);
String para = "中华人民共和国公民";
Query query = parser.parse(para);//根据文件内容搜索文件路径
/**
* 包含全部短语只有"中华人民共和国公民"命中(被分词后分出来的字词组成短语并被命中)
*/
// String[] words = query.toString("contents").split("\\s+");
// PhraseQuery pq = new PhraseQuery();
// for( String wd : words ){
// pq.add( new Term("contents",wd) );
// }
// bq.add( pq,Occur.MUST );
/**
* 包含全部字词只有"中华人民共和国公民"或"公民中华人民共和国"被命中(被分词后分出来的字词全部命中)
*/
// String[] words = query.toString("contents").split("\\s+");//空格回车换行符
// for( String wd : words ){
// TermQuery tq = new TermQuery( new Term("contents",wd) );
// bq.add(tq, Occur.MUST);
// }
/**
* 包含任意字词(被分词后分出来的字词有一个命中就行)
* 用SmartChineseAnalyzer分析器中华人民共和国被切为一个词"中华人民共和国公民" 和"中华人民共和国"都会被命中
*/
bq.add(query,Occur.MUST);
//排序,降序可根据排序和查询总条数实行分页
boolean isDesc = true;
SortField sfd = new SortField("lastModifiedTime", SortField.Type.LONG,isDesc);
Sort sort = new Sort( sfd );
TopDocs results=searcher.search(bq, 100,sort);
ScoreDoc[] sds = results.scoreDocs;
Document doc;
for(int begin = 0; begin<sds.length; begin++ ){
ScoreDoc sd = sds[begin];
doc = searcher.doc( sd.doc);
String path = doc.get( "path");
String time = doc.get( "lastModifiedTime");
String title = doc.get("title");
//高亮
String content = readFile(path);
String hightContent = highlightText(query,content);
System.out.println("searchFile******"+path+" "+time+" *content:"+content+"* contents*"+hightContent);
}
}
public static String highlightText( Query query,String content) throws ParseException, IOException, InvalidTokenOffsetsException{
//默认使用HTML的<B>标签标记关键词
//样式
Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
//用于高亮查询,query是Lucene的查询对象Query
Scorer scorer = new QueryScorer(query);
//创建一个高亮器
Highlighter highlighter = new Highlighter(formatter, scorer);
//设置文本摘要大小
Fragmenter fragmenter = new SimpleFragmenter(1000);
highlighter.setTextFragmenter(fragmenter);
//要高亮显示的内容:
//从Lucene的Document对象中取出文本内容
Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_45);
//高亮显示content
String highlighterContent = highlighter.getBestFragment(analyzer, null, content);
//如果content中没有找到关键词,会返回空。

return highlighterContent;
}
/**
* 读文件
*/
public static String readFile( String path ) throws IOException {
File file = new File( path );
if( file.exists() ){
try {
BufferedReader br = new BufferedReader( new InputStreamReader( new FileInputStream(file),"utf-8"));
StringBuffer buf = new StringBuffer();
String tmp = br.readLine();
while( tmp != null ){
buf.append(tmp);
tmp = br.readLine();
}
String bufStr = buf.toString();
return bufStr;
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}else{
System.out.println("不存在");
}
return null;
}
@Test
public void mainTest() throws Exception{
File file = new File( "D://file//file1.txt" );
if( file.exists() ){
try {
BufferedReader br = new BufferedReader( new InputStreamReader( new FileInputStream(file),"utf-8"));
StringBuffer buf = new StringBuffer();
String tmp = br.readLine();
while( tmp != null ){
buf.append(tmp);
tmp = br.readLine();
}
String bufStr = buf.toString();
System.out.println(bufStr);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}else{
System.out.println("不存在");
}
}
}

相关文档
最新文档