// Sorting a large file in Java (external merge sort).

package com.scott.util;

import java.io.*;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**

* Created by Scott on 2017/11/1.

public class LargeFileDataSort {

// 测试大文件路径

public final static String testFilePath = "E:/dataTest/largeFileData.txt";

public final static String resultFilePath = "E:/dataTest/largeFileResult.txt";

// 切分大文件的小文件大小MB, 默认为100MB

private final static int size = 200;

private static int byteSize = size * 1024 * 1024;

public static void main(String[] args) throws IOException {

// 生成测试文件

createTestData();

Long start = System.currentTimeMillis();

work();

Long end = System.currentTimeMillis();

System.out.println((end - start) / 1000/ 60);

}

/**

* 切分文件每份大小

public static void work() throws IOException {

File file = new File(testFilePath);

if (!file.exists()) {

return;

}

// 2.1 得到文件大小MB

double mbsize = file.length() / 1024 / 1024;

// 2.2 计算得到切分的文件数

double fileNum = Math.ceil(mbsize / size);

// 2.3 临时文件

List tempFileList = createTempFileList(file, fileNum);

// 2.3 切分文件

divAndFirstSort(file, tempFileList);

// 2.4 递归排序(每个文件读取多少数据放到内存排序后合并到结果文件)

// 排序合并开始

mergeLargeFile(tempFileList);

// 2.5 TODO 把临时文件删除

}

/**

* 生成测试文件

public static void createTestData() {

StringBuffer sb = new StringBuffer();

BufferedWriter bw = null;

try {

File testFile = new File(testFilePath);

if (!testFile.exists()) {

testFile.createNewFile();

}

bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(testFile)));

for (long i = 1; i <= 27777777; i++) {

sb.setLength(0);

sb.append(i).append("@@");

// sb.append(random.nextInt(100000)).append("@@");

sb.append(i).append("调用BufferedWriter的flush()方法").append("@@");

sb.append("\n");

bw.write(sb.toString());

if ((i + 1) % 5000 == 0) {

bw.flush();

}

System.out.println(i);

}

} catch (IOException e) {

System.out.println("生成测试文件失败！" + e.getMessage());

} finally {

try {

if (bw != null) {

bw.close();

}

} catch (IOException e) {

}

/**

* 把临时文件合并到结果文件

* @param tempFileList

* @throws IOException

public static void mergeLargeFile(List tempFileList) throws IOException {

List bwList = new ArrayList();

for(int i=0; i< tempFileList.size(); i++) {

FileEntity le = new FileEntity(new BufferedReader(new InputStreamReader(new FileInputStream(tempFileList.get(i)))));

bwList.add(le);

}

BufferedWriter resultBw = null;

try {

resultBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(resultFilePath)));

Long count = 0L;

FileEntity fe = null;

while ((fe = getFirstFileEntity(bwList)) != null) {

System.out.println("--- 写文件id: " + fe.getId());

// 写入符合条件的一行数据

resultBw.write(fe.getLine() + "\n");

// 准备下一行

fe.nextLine();

// 清缓冲流

if (count % 1000 == 0) {

resultBw.flush();

}

} catch (Exception e) {

} finally {

if (resultBw != null) {

try {

resultBw.close();

} catch (IOException e) {

}

// 关闭

for(int i=0; i< bwList.size(); i++) {

bwList.get(i).close();

}

/**

* 从切分的文件中按序行读取（因为切分文件时已经做好了排序）* @param bwList

* @return

private static FileEntity getFirstFileEntity(List bwList) { if (bwList.size() == 0) {

return null;

}

Iterator it = bwList.iterator();

while (it.hasNext()) {

FileEntity fe = it.next();

// 如果文件读到完就关闭流和删除在集合的文件流

if (fe.getLine() == null) {

fe.close();

it.remove();

}

if (bwList.size() == 0) {

return null;

}

// 排序获取一行数据

bwList.sort(new FileEntityComparator());

// 返回第一个符合条件的文件对象

return bwList.get(0);

}

/**

* 切分文件并做第一次排序

* @param file

* @param tempFileList

private static void divAndFirstSort(File file, List tempFileList) {

BufferedReader br = null;

try {

// 读取大文件

br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));

// 行数据保存对象

String line = null;

// 临时文件索引

int index = tempFileList.size() - 1;

// 第一个临时文件

File tempFile = tempFileList.get(index);

List lineList = new ArrayList<>();

int byteSum = 0;

// 循环临时文件并循环大文件

while ((line = br.readLine()) != null) {

line += "\n";

byteSum += line.getBytes().length;

// 如果长度达到每个文件大小则重新计算

if (byteSum >= byteSize) {

// 写入到文件

putLineListToFile(tempFileList.get(index), lineList);

index--;

byteSum = line.getBytes().length;

lineList.clear();

}

lineList.add(line);

}

if (lineList.size() > 0) {

// 写入到文件

putLineListToFile(tempFileList.get(0), lineList);

}

} catch (FileNotFoundException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

} finally {

try {

if (br != null) {

br.close();

}

} catch (IOException e) {

}

/**

* 把数据写到临时文件

* @param lineList

private static void putLineListToFile(File file, List lineList) throws IOException { FileOutputStream tempFileFos = null;

try {

// 很关键的一步，第一次写入文件必须排序

lineList.sort(new LineComparator());

tempFileFos = new FileOutputStream(file);

for(int i=0; i< lineList.size(); i++) {

tempFileFos.write(lineList.get(i).getBytes());

}

} finally {

if (tempFileFos != null) {

tempFileFos.close();

}

/**

* 生成临时文件

* @param fileNum

* @return

private static List createTempFileList(File file, double fileNum) {

List tempFileList = new ArrayList();

String fileFolder = file.getParent();

String name = file.getName();

for (int i = 0; i < fileNum; i++) {

File tempFile = new File(fileFolder + "/" + name + ".temp_" + i + ".txt");

if (tempFile.exists()) {

tempFile.delete();

}

try {

tempFile.createNewFile();

} catch (IOException e) {

e.printStackTrace();

}

tempFileList.add(tempFile);

}

return tempFileList;

}

public static int compare(String o1, String o2) {

String o1Id = o1.substring(0, o1.indexOf("@@"));

String o2Id = o2.substring(0, o2.indexOf("@@"));

// 从小到大

return Integer.parseInt(o1Id) - Integer.parseInt(o2Id);

// 从大到小

// return Integer.parseInt(o2Id) - Integer.parseInt(o1Id);

}

/**

* 排序

class LineComparator implements Comparator {

@Override

public int compare(String o1, String o2) {

return https://www.360docs.net/doc/117896929.html,pare(o1, o2);

}

/**

* 排序类

class FileEntityComparator implements Comparator { @Override

public int compare(FileEntity o1, FileEntity o2) {

return https://www.360docs.net/doc/117896929.html,pare(o1.getLine(), o2.getLine());

}

/**
 * Cursor over one line-oriented reader: holds the current line and the numeric id parsed from
 * the text in front of the first {@code "@@"} of that line.
 */
class FileEntity {

    /** Id of the current line, or {@code null} if there is no line or it has no valid id. */
    private Long id = null;

    /** Current line, or {@code null} once the reader is exhausted. */
    private String line = null;

    private BufferedReader br;

    /**
     * Wraps the reader and immediately reads its first line.
     *
     * @param br the reader to consume; closed by {@link #close()}
     * @throws IOException if the first line cannot be read
     */
    public FileEntity(BufferedReader br) throws IOException {
        this.br = br;
        // Position the cursor on the first line.
        setLineId();
    }

    /**
     * Reads the next line and derives its id.
     *
     * @throws IOException if reading fails
     */
    private void setLineId() throws IOException {
        line = br.readLine();
        // Never keep the id of a previous line around.
        id = null;
        if (line == null) {
            return;
        }
        int separator = line.indexOf("@@");
        if (separator < 0) {
            // No id prefix on this line.
            return;
        }
        try {
            id = Long.parseLong(line.substring(0, separator));
        } catch (NumberFormatException e) {
            // Prefix is not a number: the line is kept, the id stays null.
            id = null;
        }
    }

    /**
     * Closes the underlying reader. A failure while closing is ignored.
     */
    public void close() {
        if (this.br != null) {
            try {
                this.br.close();
            } catch (IOException ignored) {
                // Best effort: nothing useful can be done if closing a reader fails.
            }
        }
    }

    /**
     * Advances to the next line.
     *
     * @return this cursor
     * @throws UncheckedIOException if reading fails; silently keeping the previous line would
     *         make a caller process the same line again and again
     */
    public FileEntity nextLine() {
        try {
            setLineId();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return this;
    }

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getLine() {
        return line;
    }
}