一个朋友因工作需要统计很多文件里的数据,手工统计的工作量比较大,于是求助于我通过程序来统计。
需求:将所有数据从小到大合并成一个文件,重复数据只保留一个!并分别统计出前四位相同数据的个数!
数据片段:
19610618,
19610619,
19610622,
19610623,
19610718,
19980814,
19980820,
19980822,
19980831,
20040804,
20040809,
20040810,
20050405,
20050429,
20050619,
数据分散在不同的文件里面
/**
* @author Seave
* @since 2006-06-25
* @version 1.0
* 文件数据统计
* 将所有数据从小到大合并成一个文件,重复数据只保留一个!
* 并分别统计出前四位相同数据的个数!
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
/**
 * Merges the data lines of every file in {@code directoryPath} into a single
 * sorted, duplicate-free list, counts how many lines share each 4-character
 * prefix (the year), and writes the counts followed by the merged data to
 * {@code geneteratePath}.
 */
public class DataStatistic
{
    // Directory holding the raw data files to be merged.
    private final static String directoryPath = "d://chengqiao//";
    // Path of the generated statistics/result file.
    private final static String geneteratePath = "d://statisticResult.txt";

    /**
     * Lists the names of all entries in the data directory.
     *
     * @return the file names found in {@code directoryPath}; empty if the
     *         directory does not exist or cannot be read
     */
    private static ArrayList<String> readFile()
    {
        ArrayList<String> names = new ArrayList<String>();
        String[] fileList = new File(directoryPath).list();
        // File.list() returns null when the path is not a readable directory;
        // the original code would throw a NullPointerException here.
        if (fileList != null)
        {
            for (String name : fileList)
            {
                names.add(name);
            }
        }
        return names;
    }

    /**
     * Reads every file in the data directory and collects its lines,
     * keeping only the first occurrence of each duplicate line.
     *
     * @return the merged, duplicate-free lines (in read order)
     */
    private static ArrayList<String> combinData()
    {
        ArrayList<String> lines = new ArrayList<String>();
        for (String fileName : readFile())
        {
            try
            {
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(directoryPath + fileName)));
                try
                {
                    String line;
                    while ((line = reader.readLine()) != null)
                    {
                        // Keep only the first occurrence of each line.
                        if (!lines.contains(line))
                        {
                            lines.add(line);
                        }
                    }
                }
                finally
                {
                    // Closing the reader also closes the underlying stream;
                    // the finally block fixes the original's leak on exception.
                    reader.close();
                }
            }
            catch (IOException ioe)
            {
                // One unreadable file should not abort the remaining files
                // (the original try wrapped the whole loop).
                ioe.printStackTrace();
            }
        }
        return lines;
    }

    /**
     * Sorts the merged data in ascending (lexicographic) order.
     *
     * @return the merged lines, sorted from smallest to largest
     */
    private static ArrayList<String> sortData()
    {
        // combinData() already removed duplicates, so the original's extra
        // "already sorted" check was redundant; a library sort replaces the
        // hand-rolled O(n^2) selection sort.
        ArrayList<String> sorted = combinData();
        Collections.sort(sorted);
        return sorted;
    }

    /**
     * Counts, for each distinct 4-character prefix (the year), how many
     * data lines start with it.
     *
     * @return one "year:count条数据" entry per distinct prefix, in sorted order
     */
    private static ArrayList<String> statisticData()
    {
        ArrayList<String> result = new ArrayList<String>();
        String currentPrefix = null;
        int count = 0;
        // The list is sorted, so lines with the same prefix are contiguous:
        // a single pass replaces the original's O(n*m) double loop.
        for (String line : sortData())
        {
            if (line.length() < 4)
            {
                continue; // too short to carry a 4-character prefix
            }
            String prefix = line.substring(0, 4);
            if (prefix.equals(currentPrefix))
            {
                count++;
            }
            else
            {
                if (currentPrefix != null)
                {
                    result.add(currentPrefix + ":" + count + "条数据");
                }
                currentPrefix = prefix;
                count = 1;
            }
        }
        if (currentPrefix != null)
        {
            result.add(currentPrefix + ":" + count + "条数据");
        }
        return result;
    }

    /**
     * Writes each entry of the given list as one line of the result file,
     * overwriting any previous content.
     *
     * @param al the lines to write
     */
    private static void writeFile(ArrayList<String> al)
    {
        try
        {
            // FileOutputStream creates (or truncates) the file itself, so the
            // original exists()/createNewFile() dance was unnecessary.
            FileOutputStream fout = new FileOutputStream(geneteratePath);
            try
            {
                for (String entry : al)
                {
                    // Bug fix: the original wrote the literal two characters
                    // "/n" instead of a newline, producing a one-line file.
                    // (The per-iteration byte[al.size()] allocation was also
                    // dead — it was immediately overwritten.)
                    fout.write((entry + "\n").getBytes());
                }
                fout.flush();
            }
            finally
            {
                fout.close();
            }
        }
        catch (IOException ioe)
        {
            ioe.printStackTrace();
        }
    }

    public static void main(String[] args)
    {
        // Output format matches the original: per-year counts first,
        // then the sorted, de-duplicated data lines.
        ArrayList<String> output = statisticData();
        output.addAll(sortData());
        writeFile(output);
    }
}
最后的统计结果片段:
1961:66条数据
1962:61条数据
1963:62条数据
1964:64条数据
1998:52条数据
1999:34条数据
2000:42条数据
2001:42条数据
2002:40条数据
2003:44条数据
2004:45条数据
2005:48条数据
19610618,
19610619,
19610622,
19610623,
20040827,
20041215,
20050310,
20050311,