Nutch二次开发总结
合集下载
相关主题
- 1、下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
- 2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
- 3、如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。
// and that text is not too large...
(text.length()<maxDocBytesToAnalyze)
)
{
docFrags.add(currentFrag);
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
try
{
org.apache.lucene.analysis.Token token;
public final TextFragment[] getBestTextFragments(
TokenStream tokenStream,
String text,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
* 底层API,获取文档中最相关的(格式化)部分
* This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface.
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
// markup the cached token group info
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
newText.append(markedUpText);
lastEndOffset=Math.max(endOffset, lastEndOffset);
tokenGroup.clear();
* @param tokenStream
* @param text
* @param maxNumFragments
* @param mergeContiguousFragments
* @throws IOException
*/
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token))
{
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{
//the current token is distinct from previous tokens -
Nutch二次开发总结
通过一系列的离线活动(对于查询用户而言)的开展,Nutch检索系统相对而言变得简单了许多。在二次开发的时候,需要重点对Nutch的界面及界面显示数据进行适当的调整。
1 摘要提取
1.1 摘要提取源码分析
/**
* Low level API to get the most relevant (formatted) sections of the document.
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
token = tokenStream.next();
newText.append(markedUpText);
lastEndOffset=Math.max(lastEndOffset,endOffset);
}
//Test what remains of the original text beyond the point where we stopped analyzing
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text);
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.length())
&&
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}
TokenGroup tokenGroup=new TokenGroup();
token = tokenStream.next();
while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))
currentFrag.textEndPos = newText.length();
//sort the most relevant sections of the text
for (Iterator i = docFrags.iterator(); i.hasNext();)
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
(text.length()<maxDocBytesToAnalyze)
)
{
docFrags.add(currentFrag);
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
try
{
org.apache.lucene.analysis.Token token;
public final TextFragment[] getBestTextFragments(
TokenStream tokenStream,
String text,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
* 底层API,获取文档中最相关的(格式化)部分
* This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface.
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
// markup the cached token group info
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
newText.append(markedUpText);
lastEndOffset=Math.max(endOffset, lastEndOffset);
tokenGroup.clear();
* @param tokenStream
* @param text
* @param maxNumFragments
* @param mergeContiguousFragments
* @throws IOException
*/
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token))
{
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{
//the current token is distinct from previous tokens -
Nutch二次开发总结
通过一系列的离线活动(对于查询用户而言)的开展,Nutch检索系统相对而言变得简单了许多。在二次开发的时候,需要重点对Nutch的界面及界面显示数据进行适当的调整。
1 摘要提取
1.1 摘要提取源码分析
/**
* Low level API to get the most relevant (formatted) sections of the document.
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
token = tokenStream.next();
newText.append(markedUpText);
lastEndOffset=Math.max(lastEndOffset,endOffset);
}
//Test what remains of the original text beyond the point where we stopped analyzing
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text);
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.length())
&&
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}
TokenGroup tokenGroup=new TokenGroup();
token = tokenStream.next();
while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))
currentFrag.textEndPos = newText.length();
//sort the most relevant sections of the text
for (Iterator i = docFrags.iterator(); i.hasNext();)
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));