// JAVA实现大文件排序 — sorting a large file in Java (split into sorted chunks, then merge)
package com.scott.util;
import java.io.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
/**
* Created by Scott on 2017/11/1.
*/
public class LargeFileDataSort {
// Path of the large input file: createTestData() writes it and work() reads it
public final static String testFilePath = "E:/dataTest/largeFileData.txt";
// Path of the merged, sorted result file written by mergeLargeFile()
public final static String resultFilePath = "E:/dataTest/largeFileResult.txt";
// Size in MB of each small file the large file is split into.
// NOTE(review): the original comment said the default is 100MB, but the value here is 200.
private final static int size = 200;
// The same chunk size expressed in bytes (200 * 1024 * 1024 still fits in an int)
private static int byteSize = size * 1024 * 1024;
/**
 * Entry point: generates the test file, runs the sort, then prints the
 * elapsed time of the sort in whole minutes.
 *
 * @param args unused
 * @throws IOException if the sort fails with an I/O error
 */
public static void main(String[] args) throws IOException {
    // Build the input data first so that it is not part of the measured time
    createTestData();
    final long startedAt = System.currentTimeMillis();
    work();
    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    // Milliseconds -> seconds -> minutes, truncated by integer division
    System.out.println(elapsedMillis / 1000 / 60);
}
/**
* 切分文件每份大小
*/
public static void work() throws IOException {
File file = new File(testFilePath);
if (!file.exists()) {
return;
}
// 2.1 得到文件大小MB
double mbsize = file.length() / 1024 / 1024;
// 2.2 计算得到切分的文件数
double fileNum = Math.ceil(mbsize / size);
// 2.3 临时文件
List
// 2.3 切分文件
divAndFirstSort(file, tempFileList);
// 2.4 递归排序(每个文件读取多少数据放到内存排序后合并到结果文件)
// 排序合并开始
mergeLargeFile(tempFileList);
// 2.5 TODO 把临时文件删除
}
/**
 * Generates the test input file at {@code testFilePath}.
 *
 * Each row starts with a sequential id followed by "@@" and then carries 16
 * identical "id + text + @@" fields. Ids run from 1 to 27777777 in ascending
 * order. A failure is reported on standard output and not rethrown.
 */
public static void createTestData() {
    StringBuilder row = new StringBuilder();
    BufferedWriter out = null;
    try {
        File target = new File(testFilePath);
        if (!target.exists()) {
            target.createNewFile();
        }
        out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target)));
        long id = 1;
        while (id <= 27777777) {
            row.setLength(0);
            // Leading id: this is the sort key read back by the comparators
            row.append(id).append("@@");
            // Payload: the same field repeated 16 times
            for (int field = 0; field < 16; field++) {
                row.append(id).append("调用BufferedWriter的flush()方法").append("@@");
            }
            row.append("\n");
            out.write(row.toString());
            // Push the buffer to disk periodically
            if ((id + 1) % 5000 == 0) {
                out.flush();
            }
            System.out.println(id);
            id++;
        }
    } catch (IOException e) {
        System.out.println("生成测试文件失败!" + e.getMessage());
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                // close failure is ignored, as in the original
            }
        }
    }
}
/**
* 把临时文件合并到结果文件
* @param tempFileList
* @throws IOException
*/
public static void mergeLargeFile(List
List
for(int i=0; i< tempFileList.size(); i++) {
FileEntity le = new FileEntity(new BufferedReader(new InputStreamReader(new FileInputStream(tempFileList.get(i)))));
bwList.add(le);
}
BufferedWriter resultBw = null;
try {
resultBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(resultFilePath)));
Long count = 0L;
FileEntity fe = null;
while ((fe = getFirstFileEntity(bwList)) != null) {
System.out.println("--- 写文件id: " + fe.getId());
// 写入符合条件的一行数据
resultBw.write(fe.getLine() + "\n");
// 准备下一行
fe.nextLine();
// 清缓冲流
if (count % 1000 == 0) {
resultBw.flush();
}
}
} catch (Exception e) {
} finally {
if (resultBw != null) {
try {
resultBw.close();
} catch (IOException e) {
}
}
}
// 关闭
for(int i=0; i< bwList.size(); i++) {
bwList.get(i).close();
}
}
/**
* 从切分的文件中按序行读取(因为切分文件时已经做好了排序)* @param bwList
* @return
*/
private static FileEntity getFirstFileEntity(List
return null;
}
Iterator
while (it.hasNext()) {
FileEntity fe = it.next();
// 如果文件读到完就关闭流和删除在集合的文件流
if (fe.getLine() == null) {
fe.close();
it.remove();
}
}
if (bwList.size() == 0) {
return null;
}
// 排序获取一行数据
bwList.sort(new FileEntityComparator());
// 返回第一个符合条件的文件对象
return bwList.get(0);
}
/**
* 切分文件并做第一次排序
* @param file
* @param tempFileList
private static void divAndFirstSort(File file, List
BufferedReader br = null;
try {
// 读取大文件
br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
// 行数据保存对象
String line = null;
// 临时文件索引
int index = tempFileList.size() - 1;
// 第一个临时文件
File tempFile = tempFileList.get(index);
List
int byteSum = 0;
// 循环临时文件并循环大文件
while ((line = br.readLine()) != null) {
line += "\n";
byteSum += line.getBytes().length;
// 如果长度达到每个文件大小则重新计算
if (byteSum >= byteSize) {
// 写入到文件
putLineListToFile(tempFileList.get(index), lineList);
index--;
byteSum = line.getBytes().length;
lineList.clear();
}
lineList.add(line);
}
if (lineList.size() > 0) {
// 写入到文件
putLineListToFile(tempFileList.get(0), lineList);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (br != null) {
br.close();
}
} catch (IOException e) {
}
}
/**
* 把数据写到临时文件
* @param lineList
*/
private static void putLineListToFile(File file, List
try {
// 很关键的一步,第一次写入文件必须排序
lineList.sort(new LineComparator());
tempFileFos = new FileOutputStream(file);
for(int i=0; i< lineList.size(); i++) {
tempFileFos.write(lineList.get(i).getBytes());
}
} finally {
if (tempFileFos != null) {
tempFileFos.close();
}
}
}
/**
* 生成临时文件
* @param fileNum
* @return
*/
private static List
List
String fileFolder = file.getParent();
String name = file.getName();
for (int i = 0; i < fileNum; i++) {
File tempFile = new File(fileFolder + "/" + name + ".temp_" + i + ".txt");
if (tempFile.exists()) {
tempFile.delete();
}
try {
tempFile.createNewFile();
} catch (IOException e) {
e.printStackTrace();
}
tempFileList.add(tempFile);
}
return tempFileList;
}
/**
 * Compares two lines by the numeric id that precedes the first "@@",
 * in ascending order.
 *
 * @param o1 the first line; must start with {@code <id>@@}
 * @param o2 the second line; must start with {@code <id>@@}
 * @return a negative value, zero or a positive value when the id of
 *         {@code o1} is less than, equal to or greater than the id of {@code o2}
 * @throws NumberFormatException if an id is not a valid number
 * @throws StringIndexOutOfBoundsException if a line contains no "@@"
 */
public static int compare(String o1, String o2) {
    long o1Id = Long.parseLong(o1.substring(0, o1.indexOf("@@")));
    long o2Id = Long.parseLong(o2.substring(0, o2.indexOf("@@")));
    // Ascending. Long.compare is used because subtracting the ids can
    // overflow and report the wrong order; swap the arguments for descending.
    return Long.compare(o1Id, o2Id);
}
}
/**
* 排序
*/
class LineComparator implements Comparator
@Override
public int compare(String o1, String o2) {
return https://www.360docs.net/doc/117896929.html,pare(o1, o2);
}
}
/**
* 排序类
*/
class FileEntityComparator implements Comparator
public int compare(FileEntity o1, FileEntity o2) {
return https://www.360docs.net/doc/117896929.html,pare(o1.getLine(), o2.getLine());
}
}
/**
 * A cursor over one file: wraps a reader and exposes the line it is
 * currently positioned on together with the numeric id parsed from the start
 * of that line.
 */
class FileEntity {
    /** Separator that ends the id at the start of a line. */
    private static final String ID_DELIMITER = "@@";

    /** Id of the current line; null when there is no line or no parsable id. */
    private Long id = null;
    /** The current line; null once the end of the stream has been reached. */
    private String line = null;
    private BufferedReader br;

    /**
     * Wraps the reader and positions the entity on its first line.
     *
     * @param br the reader to take lines from
     * @throws IOException if the first line cannot be read
     */
    public FileEntity(BufferedReader br) throws IOException {
        this.br = br;
        // Read the first line right away
        setLineId();
    }

    /**
     * Reads the next line and refreshes the id that belongs to it.
     *
     * @throws IOException if the reader fails
     */
    private void setLineId() throws IOException {
        line = br.readLine();
        // Always recompute, so the id never belongs to an earlier line
        id = parseId(line);
    }

    /** Parses the number in front of the first "@@"; null when the line has none. */
    private static Long parseId(String text) {
        if (text == null) {
            return null;
        }
        int end = text.indexOf(ID_DELIMITER);
        if (end < 0) {
            return null;
        }
        try {
            return Long.parseLong(text.substring(0, end));
        } catch (NumberFormatException ignored) {
            // Not a number: the line simply has no id
            return null;
        }
    }

    /**
     * Closes the underlying reader. Failures are ignored on purpose, because
     * this is only used for cleanup.
     */
    public void close() {
        if (this.br != null) {
            try {
                this.br.close();
            } catch (Exception ignored) {
                // best-effort cleanup
            }
        }
    }

    /**
     * Advances to the next line.
     *
     * @return this entity, now positioned on the next line
     * @throws UncheckedIOException if the reader fails; keeping the previous
     *         line instead would make a caller process it again and again
     */
    public FileEntity nextLine() {
        try {
            setLineId();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return this;
    }

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getLine() {
        return line;
    }
}