PageRank算法

合集下载
  1. 1、下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
  2. 2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
  3. 3、如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。

if (total<1)
{
cout<<"BuildIndex Error and the program exit"<<endl;
return 0;
}
size_t DocID=0;
int outDegree = 0;
fclose(file);
file = fopen("indexMap.dat","rb");
SortMap.dat是一个链接关系文件,已经按照DocID(每个网页的唯一编号,64位整型)进行排序了.基本上对这样的大文件排序都是进行分割,排序,归并实现的(大家有兴趣的话我下次将排序源代码也贴上来).如果您手头没有这样的链接数据话/labs/dl/t-link.html(互联网语料链接关系库),这个里面可以提取,然后就是排序,然后就是下面的源代码计算,你就能看到传说中的PageRank了.Good Luck!
unsigned int MaxOutDegree;
fread(&total,4,1,countFile);
fread(&MaxOutDegree,4,1,countFile);
fclose(countFile);
cout<<"total is :\t"<<total<<"\tand the MaxOutDegree:"<<MaxOutDegree<<endl;
size_t position = total;
size_t index=0;
for (size_t iter=0;iter<30;++iter)
{
rewind(file);
start = GetTickCount();
for (size_t i=0;i<total;++i)
for (int k=0;k<outDegree;++k)
{
fread(sons,4,1,file);
if (total > *sons && *sons>=0)
{
prTmp[ *sons ] += fatherRank;
}
//fread(sons,LENGTHOFSINGLE+1,1,mapFile);
DocID = *(unsigned __int64 *)(sons+j*(LENGTHOFSINGLE+1));
position = binaryFind(index,index+total,DocID);
prTmp[i] = 0.0f;
}
end = GetTickCount();
cout<<"end of index:"<<end-start<<endl;
ofstream fout("pagerank.txt");
float fatherRank = 1;
const float alpha = 0.85f;
最后面是源代码,前面是简单的原理介绍
转载请注明出处: /32167163.html
曾经在网上找了很多关于PageRank的论文,基本上都是介绍原理的,没有找到计算过程的具体实现的源代码,前一阵公司有需要于是写了一个,比较简单(没有反作弊,Blog链接的处理,黑洞的处理都没有管),就是用极限编程的思想用最快的速度实现了一个个人感觉计算效率还不错,(没分块,其实前期分块后对后续的计算过程也是一样的了P4 3.0,512M),1700万网页迭代一次需要25秒钟的样子.
FILE *DocIDIndex = fopen("DocID.Index","rb");
cout<<"start the total is:"<<total<<endl;
if(total>0)
{ //真正实现的计算过程
unsigned int * sons = new unsigned int[1];
rewind(mapFile);
size_t position;
for(size_t i=0;i<total;++i)
{
fread(&DocID,LENGTHOFSINGLE,1,mapFile);
fread(&outDegree,4,1,mapFile);
fread(sons,LENGTHOFSINGLE+1,outDegree,mapFile);
fwrite(&i,4,1,indexMap);
fwrite(&outDegree,4,1,indexMap);
for (int j=0;j<outDegree;++j)
{
temp = *(begin+mid);
if(temp==value)
{
return mid;
}
if(temp>value)
{
high=mid-1;
continue;
}
else
{
low=mid+1;
continue;
start = GetTickCount();
float *pageRank = new float[total];
float *prTmp = new float[total];
for (size_t i=0;i<total;++i)
{
pageRank[i] = 1.0f;
template <typename type_key>
size_t binaryFind(type_key *begin,type_key *end,const type_key &value)
{
if(begin>=end) return end-begin;
//unsigned __int64 size = sizeof(type_key);
delete []sons;
}
cout<<"over build the index"<<endl;
return total;
}
int main(int argc, char *argv[])
{
ofstream fout("log.txt");
FILE *file = NULL;
#include <iostream>
#include <fstream>
#include <vector>
#include ห้องสมุดไป่ตู้hash_map>
#include <hash_set>
#include <assert.h>
#include "../FileSort/Sort.h" //一个实现快速排序的文件,后面需要排序的函数都在这个文件中
FILE *DocIDIndex = fopen("DocID.index","wb");
unsigned __int64 DocID;
int outDegree;
for(size_t i=0;i<total;++i)
{
fread(index+i,LENGTHOFSINGLE,1,mapFile);
if (position != total)
{//找到
fwrite(&position,4,1,indexMap);
}
else fwrite(&total,4,1,indexMap);
}
}
fcloseall();
delete []index;
fread(&outDegree,4,1,mapFile);
fread(sons,LENGTHOFSINGLE+1,outDegree,mapFile);
}
cout<<"end of read file"<<endl;
fwrite(index,LENGTHOFSINGLE,total,DocIDIndex);
unsigned int start,end;
start = GetTickCount();
size_t total=BuildIndex(file);
end = GetTickCount();
cout<<"Build index lost:\t"<<end-start<<endl;
//long len = (end-begin);
int total = (end-begin);
int low=0;
int high=total-1;
int mid=0;
unsigned __int64 temp = 0;
while(low<=high)
{
mid = (low+high)/2;
}
}
return total;
}
//完成将DocID转换成Index的过程,以后每次迭代不用再进行查找,算是中间的辅助函数
unsigned int BuildIndex(FILE * mapFile)
{
cout<<"start to build index"<<endl;
FILE *countFile = fopen("count.dat","rb");
if (countFile==NULL)
{
cout<<"count.dat can not open,please check the file"<<endl;
return 0;
}
unsigned int total;
#define BLOCKSIZE (64*1024*1024)
//#define GROUP 8
#define LENGTHOFSINGLE 8
#define OUTDEGREESIZE 2048
#pragma pack(1)
using namespace std;
//实现二分查找功能
}
}
for (int i=0;i<total;++i)
{
prTmp[i] = 0.15f + alpha*prTmp[i];
pageRank[i] = prTmp[i];
prTmp[i] = 0.0f;
}
end = GetTickCount();
PageRank计算实现原理 PageRank基本原理 可能有点帮助吧,中文的PageRank学习笔记
后面是计算PageRank的源代码,希望大家喜欢:(这里贴源代码确实太难看了 )
#include <windows.h>
#include <winbase.h>
#include <stdio.h>
if (total>0)
{
unsigned __int64 *index = new unsigned __int64[total];
char *sons = new char[MaxOutDegree*(LENGTHOFSINGLE+1)];
FILE *indexMap = fopen("indexMap.dat","wb");
if(argc>=2)
file = fopen(argv[1],"rb");
else
file = fopen("F:\\ren\\SortMap.dat","rb");
if(file==NULL)
cout<<"--------can not open the file--------"<<endl;
{
fread(&DocID,4,1,file);
fread(&outDegree,4,1,file);
if (outDegree==0)
{
continue;
}
fatherRank = pageRank[i]/outDegree;
相关文档
最新文档