Data Mining Lab Report No. 3
Class: Department of Mathematics and Physics, 091001
Student ID: 091001102
Name: Gao Pan    Advisor: Liu Jianwei
The Apriori Algorithm
1. Objective:
From a theoretical standpoint, this experiment first analyzes the principles and application areas of association-rule mining and clustering algorithms; it then describes the implementation and encapsulation of the Apriori algorithm, designs a visual interface, and tests the algorithm.
2. Requirements:
Use the Apriori algorithm to implement the mathematical computation method of the program.
3. Principle:
The Apriori algorithm uses a level-wise, iterative approach known as candidate generation and test, in which frequent k-itemsets are used to explore (k+1)-itemsets.
First, the set of frequent 1-itemsets is found; this set is denoted L1.
L1 is used to find L2, the set of frequent 2-itemsets; L2 is used to find L3, and so on, until no more frequent k-itemsets can be found.
Finding each Lk requires one full scan of the database.
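As a small worked illustration of this level-wise idea (the class name TinyApriori, the item IDs and the four transactions below are invented for this example and are not part of the experiment data), the following standalone Java program hard-codes a tiny database with minimum support 2 and derives L1 and L2 directly:

// Minimal standalone sketch of the level-wise procedure (illustrative only).
import java.util.*;

public class TinyApriori {
    public static void main(String[] args) {
        int[][] db = { {1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5} };
        int minsup = 2;

        // Pass 1: count every single item and keep those with support >= minsup.
        Map<Integer, Integer> counts = new HashMap<>();
        for (int[] t : db)
            for (int item : t)
                counts.merge(item, 1, Integer::sum);
        List<Integer> l1 = new ArrayList<>();
        for (Map.Entry<Integer, Integer> e : new TreeMap<>(counts).entrySet())
            if (e.getValue() >= minsup)
                l1.add(e.getKey());
        System.out.println("L1 = " + l1);   // prints [1, 2, 3, 5]

        // Pass 2: join L1 with itself to form 2-item candidates, rescan to count them.
        for (int a = 0; a < l1.size(); a++) {
            for (int b = a + 1; b < l1.size(); b++) {
                int support = 0;
                for (int[] t : db) {
                    boolean hasA = false, hasB = false;
                    for (int item : t) {
                        if (item == l1.get(a)) hasA = true;
                        if (item == l1.get(b)) hasB = true;
                    }
                    if (hasA && hasB) support++;
                }
                if (support >= minsup)
                    System.out.println("L2 itemset {" + l1.get(a) + "," + l1.get(b)
                            + "} support " + support);
            }
        }
    }
}

Each further pass would repeat the same count-and-prune step on larger candidates; the full implementation below organizes the candidates in a trie instead of explicit lists.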
4. Experiment Content:
package datamining;
import java.io.*;
import java.util.*;
/**
 * A bare bone clean implementation of the Apriori
 * algorithm for finding frequent itemsets. Good for educational
 * purposes and as a root class for experimenting on
 * optimizations.
 *
 * In the latest version the use of DataHandler is added for reading
 * the database.
 *
 * @author Michael Holler
 * @version 0.8, 16.03.2004
 */
public class Apriori {
    int pass;              // number of passes
    int total;             // total number of frequent itemsets
    int minsup;            // minimal support of itemset
    String filename;       // the filename of the database
    Item root;             // the root item of the Trie
    BufferedWriter writer; // the buffer to write the output to
    DataHandler dh;        // the handler for the database
    /**
     * Default constructor for creating an Apriori object.
     */
    public Apriori() {
        this.pass = 0;
        this.minsup = 4;
        this.dh = new DataHandler("test.dat");
        this.root = new Item(0);
    }
    /**
     * Constructor for creating an Apriori object with parameters.
     *
     * @param filename the name of the database file
     * @param minsup   the minimal support threshold
     * @param outfile  the name of the output file
     */
    public Apriori(String filename, int minsup, String outfile) {
        this.pass = 0;
        this.minsup = minsup;
        this.dh = new DataHandler(filename);
        this.root = new Item(0);
        try {
            if (!outfile.equals("")) {
                writer = new BufferedWriter(new FileWriter(outfile));
            }
        } catch (Exception e) {}
    }
    /**
     * Constructor for creating an Apriori object with parameters.
     * This one is used with other mining algorithms.
     *
     * @param minsup      the minimal support threshold
     * @param datahandler the handler for the database
     */
    public Apriori(int minsup, DataHandler datahandler) {
        this.pass = 0;
        this.minsup = minsup;
        this.dh = datahandler;
        this.root = new Item(0);
    }
    /**
     * The workhorse method for the basic implementation of
     * the Apriori algorithm.
     */
    public void findFrequentSets() {
        boolean running = true;
        int candidates = 0, transactions = 0, pruned = 0, itemsets;
        while (running) {
            this.pass++;
            candidates = this.generateCandidates(this.root, new Vector(), 1);
            transactions = this.countSupport();
            pruned = this.pruneCandidates(this.root);
            itemsets = candidates - pruned;
            // correct the candidate count on first pass for printing
            if (this.pass == 1)
                candidates = total;
            total += itemsets;
            if (itemsets <= this.pass && this.pass > 1) {
                running = false;
            }
            System.out.println("pass: " + this.pass +
                    ", total: " + total +
                    ", candidates: " + candidates +
                    ", pruned: " + pruned);
        }
    }
    /**
     * Method for generating new candidates.
     * Copies the siblings of an item to its children.
     *
     * @param item    the item to which generated items are added
     * @param current the current itemset being built
     * @param depth   the depth of recursion
     * @return the number of new candidates generated
     */
    public int generateCandidates(Item item, Vector current, int depth) {
        Vector v = item.getChildren();
        Item child = item;
        int generated = 0;
        for (Enumeration e = v.elements(); e.hasMoreElements();) {
            child = (Item)e.nextElement();
            current.add(child);
            if (depth == this.pass - 1) {
                // at the frontier of the trie: extend by copying larger siblings
                generated += this.copySiblings(child, v, current);
            } else {
                generated += this.generateCandidates(child, current, depth + 1);
            }
            current.remove(child);
        }
        return generated;
    }
    /**
     * Method for copying the siblings of an Item to its children.
     *
     * @param item     the item to which the siblings are copied
     * @param siblings the siblings to be copied
     * @param current  the current itemset to be generated
     * @return the number of siblings copied
     */
    public int copySiblings(Item item, Vector siblings, Vector current) {
        Enumeration e = siblings.elements();
        Item parent = item;
        Item sibling = new Item();
        int copied = 0;
        // skip siblings whose labels are smaller than the parent's label
        while (sibling.getLabel() < parent.getLabel() && e.hasMoreElements()) {
            sibling = (Item)e.nextElement();
        }
        while (e.hasMoreElements()) {
            sibling = (Item)e.nextElement();
            current.add(sibling);
            if (this.pass <= 2 || this.checkSubsets(current, this.root.getChildren(), 0, 1)) {
                parent.addChild(new Item(sibling.getLabel()));
                copied++;
            }
            current.remove(sibling);
        }
        return copied;
    }

    /**
     * Checks if the subsets of the itemset to be generated are all frequent.
     *
     * @param current  the current itemset to be generated
     * @param children the children in the trie on this depth
     * @param mark     the mark in the current itemset
     * @param depth    the depth of recursion
     * @return true if the subsets are frequent, else false
     */
    public boolean checkSubsets(Vector current, Vector children, int mark, int depth) {
        boolean ok = true;
        Item child;
        int index;
        int i = depth;
        if (children == null) return false;
        while (ok && (mark <= i)) {
            index = children.indexOf(current.elementAt(i));
            if (index >= 0) {
                if (depth < this.pass - 1) {
                    child = (Item)children.elementAt(index);
                    ok = checkSubsets(current, child.getChildren(), i + 1, depth + 1);
                }
            } else {
                ok = false;
            }
            i--;
        }
        return ok;
    }
    /**
     * Method for counting the supports of the candidates
     * generated on this pass.
     *
     * @return the number of transactions from which
     *         the support was counted
     */
    public int countSupport() {
        int rowcount = 0;
        int[] items;
        this.dh.open();
        for (items = this.dh.read(); items.length > 0; items = this.dh.read()) {
            rowcount++;
            if (this.pass == 1) {
                this.root.incSupport();
                this.total += generateFirstCandidates(items);
            } else {
                countSupport(root, items, 0, 1);
            }
        }
        return rowcount;
    }
    /**
     * Method generates the first candidates by adding each item
     * found in the database to the children of the root item. Also
     * counts the supports of the items found in the database.
     *
     * @param items the array of integer items from the database
     * @return the number of candidates generated
     */
    public int generateFirstCandidates(int[] items) {
        Vector v = root.getChildren();
        Enumeration e = v.elements();
        Item item = new Item();
        int generated = 0;
        for (int i = 0; i < items.length; i++) {
            while (e.hasMoreElements() && item.getLabel() < items[i]) {
                item = (Item)e.nextElement();
            }
            if (item.getLabel() == items[i]) {
                item.incSupport();
                if (e.hasMoreElements())
                    item = (Item)e.nextElement();
            } else if (item.getLabel() > items[i]) {
                int index = v.indexOf(item);
                Item child = new Item(items[i]);
                child.incSupport();
                this.root.addChild(child, index);
                generated++;
            } else {
                Item child = new Item(items[i]);
                child.incSupport();
                this.root.addChild(child);
                generated++;
            }
        }
        return generated;
    }
    /**
     * Adds the cover of the Item given as parameter and all the
     * Items in the Trie below it.
     *
     * @param item  the item the cover of which is to be counted
     * @param items the array of integer items from the database
     * @param i     the position in the array
     * @param depth the depth of recursion
     */
    public void countSupport(Item item, int[] items, int i, int depth) {
        Vector v = item.getChildren();
        Item child;
        int tmp;
        Enumeration e = v.elements();
        // loop through the children to check
        while (e.hasMoreElements()) {
            child = (Item)e.nextElement();
            // break, if the whole transaction is checked
            if (i == items.length) { break; }
            // do a linear search for the child in the transaction starting from i
            tmp = i;
            while (tmp < items.length && items[tmp] < child.getLabel())
                tmp++;
            // if the same item exists, increase support or go deeper
            if (tmp < items.length && child.getLabel() == items[tmp]) {
                if (depth == this.pass) {
                    child.incSupport();
                } else {
                    countSupport(child, items, tmp + 1, depth + 1);
                }
                i = tmp + 1;
            }
        }
    }
    /**
     * Method for pruning the candidates. Removes items that are
     * not frequent from the Trie.
     *
     * @param item the item the children of which will be pruned
     * @return the number of items pruned from the candidates
     */
    public int pruneCandidates(Item item) {
        Vector v = item.getChildren();
        Item child = item;
        int pruned = 0;
        for (Enumeration e = new Vector(v).elements(); e.hasMoreElements();) {
            child = (Item)e.nextElement();
            // check infrequency, existence and that it is fully counted
            if (child.getSupport() < this.minsup) {
                v.remove(child);
                pruned++;
            } else {
                pruned += pruneCandidates(child);
            }
        }
        return pruned;
    }
    /**
     * Method gets and returns the root of the
     * candidate trie.
     *
     * @return the root of the candidate trie
     */
    public Item getTrie() {
        return this.root;
    }

    /**
     * Method prints the itemsets to the system output and to a file
     * if the name of an output file exists.
     */
    public void printFrequentSets() {
        if (this.writer != null) {
            print(root, "");
        }
        System.out.println("\nNumber of frequent itemsets found: " + this.total);
    }
    /**
     * Loops through the Trie recursively adding
     * paths and subpaths to the output string along the way.
     *
     * @param item the item where the recursion is
     * @param str  the string of the gathered itemset
     */
    public void print(Item item, String str) {
        Vector v = item.getChildren();
        for (Enumeration e = v.elements(); e.hasMoreElements();) {
            item = (Item)e.nextElement();
            try {
                this.writer.write(str + item.getLabel()
                        + " (" + item.getSupport() + ")\n");
                this.writer.flush();
            } catch (Exception x) {
                System.out.println("no output file");
            }
            if (item.hasChildren()) {
                print(item, str + item.getLabel() + " ");
            }
        }
    }
    /**
     * Main method for testing the algorithm.
     *
     * @param args the arguments can contain the filename
     *             of the testfile and the minimal support
     *             threshold and a filename for output
     */
    public static void main(String args[]) {
        String testfile = "test.dat";
        String outfile = "";
        int support = 5;
        try {
            testfile = args[0];
        } catch (Exception e) {
            System.out.println("Didn't get filename. Using " + testfile + ".");
        }
        try {
            support = new Integer(args[1]).intValue();
        } catch (Exception e) {
            System.out.println("Didn't get support threshold. Using " + support + ".");
        }
        try {
            outfile = args[2];
        } catch (Exception e) {
            System.out.println("Didn't get output filename. Not printing.");
        }
        StopWatch sw = new StopWatch();
        sw.start();
        Apriori apriori = new Apriori(testfile, support, outfile);
        apriori.findFrequentSets();
        apriori.printFrequentSets();
        sw.stop();
        sw.print();
    }
}
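The listing above depends on companion classes (DataHandler, Item, StopWatch) that are not reproduced in this report. As an illustrative sketch only, assuming those classes are compiled in the same datamining package and that test.dat holds one transaction per line as ascending integer item IDs (an assumption about the DataHandler format, not verified here), the class could be used from other code like this:

// Hypothetical driver, shown only to illustrate the public interface.
Apriori apriori = new Apriori("test.dat", 4, "frequent.txt");
apriori.findFrequentSets();   // level-wise candidate generation, counting and pruning
apriori.printFrequentSets();  // writes "itemset (support)" lines to frequent.txt

Equivalently, the built-in main method accepts the same three values on the command line, e.g. java datamining.Apriori test.dat 4 frequent.txt.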
5. Conclusion:
This experiment clarified what the Apriori algorithm is and how associations among data items are discovered. Apriori is one of the most influential algorithms for mining frequent itemsets for Boolean association rules, and many later mining algorithms are improvements built on it, such as hash-based methods, partition-based methods, and the FP-Growth method, which avoids candidate generation altogether.
Therefore, to understand association-rule algorithms, one must first understand the Apriori algorithm.