今天自己写了个读取超大文件并排序的程序,结果遇到了问题:效率很低,一个 14G 的文件,跑了 20 分钟还没读完 1G。
需求:读取文件内容,并按照第一个字段排序,数据大概有几十亿条。
文件格式如下:
12312312312,33344,abb
435312312,33344,abb
1342312314,33344,abb
1312315,33344,abb
333123121231,33344,abb
63123125432,33344,abb
32312312312,33344,abb
11123123789,33344,abb
9623125672,33344,abb
6412095,33344,abb
312353,33344,abb
1131297,33344,abb
23231208,33344,abb
781297,33344,abb
12312387,33344,abb
43531256,33344,abb
13423354,33344,abb
1312300,33344,abb
33312312375,33344,abb
631232,33344,abb
323123144,33344,abb
111231287,33344,abb
96231231,33344,abb
6412344,33344,abb
12312312343
52131231
867
8121231
4523
代码如下:
/**
* Created with IntelliJ IDEA.
* User: 菜鸟大明
* Date: 14-8-21
* Time: 下午7:25
* To change this template use File | Settings | File Templates.
*/
public class BigDataDeal {
// Path of the result file; readFile() opens it read/write and truncates it to zero length.
String resultPath = "D:\\file\\result.txt";
// Path of the input file that readFile() scans line by line.
String filePath = "D:\\temp.txt";
// Prefix for the intermediate bucket files; readFile() appends an integer key taken from
// the leading one or two characters of each line's first comma-separated field.
String createFilePath = "D:\\file\\";
/**
 * Program entry point: constructs a {@code BigDataDeal} and invokes {@code readFile()} on it.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException if {@code readFile()} throws it
 */
public static void main(String[] args) throws IOException {
    new BigDataDeal().readFile();
}
public void readFile() throws IOException {
FileChannel fw = new RandomAccessFile(filePath,"rw").getChannel();
Scanner scaner = new Scanner(fw);
// 3 KB buffer (1024*3 bytes) -- note this is 3 KB, not 3 MB
ByteBuffer buf = ByteBuffer.allocate(1024*3);
while (scaner.hasNext()) {
buf.flip();
String line = scaner.nextLine();
int no;
if (line.split(",")[0].length() == 1) {
no = Integer.valueOf(line.split(",")[0]);
} else {
no = Integer.valueOf(line.split(",")[0].substring(0, 2));
}
FileChannel fr = new RandomAccessFile(createFilePath + no,"rw").getChannel();
fr.write(ByteBuffer.wrap(line.getBytes()), fr.size());
fr.write(ByteBuffer.wrap("\r\n".getBytes()),fr.size());
buf.clear();
fr.close();
}
fw.close();
FileChannel fw2 = new RandomAccessFile(resultPath,"rw").getChannel();
fw2.truncate(0);
// 循环每个文件
for (int i = 0; i < 100; i++) {
if (!(new File(createFilePath + i)).exists()) {
continue;
}
// 排序
List<String> list = new ArrayList<String>();
FileChannel fr2 = new RandomAccessFile(createFilePath + i,"rw").getChannel();
Scanner sc = new Scanner(fr2);
// 将每个文件每行存入到内存中
while (sc.hasNext()) {
list.add(sc.nextLine());
}
// 排序
Collections.sort(list);
for (String line : list) {