学习POI对excel中xls文件读较大数据的两种方法数据处理的方法和速度对比
本帖最后由 象相合 于 2018-5-17 15:59 编辑大数据的excel读写不像小数据,对于读写的方法,数据精度,数据冗余度,数据稳定性都有相当的要求。
于是在网上搜到一份参考资料,沿着链接跳进去得到另一份:
前置资料:https://www.cnblogs.com/tootwo2/p/6683143.html
http://poi.apache.org/spreadsheet/how-to.html
但是,这个人并没有提供相应的速度评测,文章内容也是从POI官网内容捞下来的,实在没有什么可比性。【= =于是就有了本文:
在这里,只讨论两种模式:1、eventmodel,只读,CPU和内存消耗非常低。事件监听实现。2、usermodel,可读可写,CPU和内存消耗非常大。普通模式,直接获取实现。
这两种模式读取excel的时间格式都是double型,因此读取时还需要对数据整理:
先上代码,再作解释:
a). 配置maven依赖:
<!-- 读取execl 文件 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xerces</artifactId>
<version>2.4.0</version>
</dependency>
b). XlsReadWriteMethod:普通的usermodel模式读取excel
package excel.POI;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.ss.usermodel.*;
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
* 可读可写的Xls/Xlsx
* */
public class XlsReadWriteMethod {
private String filePath;
XlsReadWriteMethod(String filePath) {
this.filePath = filePath;
}
public static void main(String[] args) {
try {
Long start = System.currentTimeMillis();
String filePath = "F:/temp/excel/1.xls";
XlsReadWriteMethod xlsReadWriteMethod = new XlsReadWriteMethod(filePath);
Iterator iter = xlsReadWriteMethod.RowIterator(0);
Long end = System.currentTimeMillis();
System.out.println("get data time = " + (end - start));
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* 迭代FileBean的每一行
* */
public Iterator<String[]> RowIterator(int sheetAt) throws Exception {
Workbook workbook = WorkbookFactory.create(new FileInputStream(new File(filePath)));
Sheet sheet = workbook.getSheetAt(sheetAt); //get index of sheet
List<String[]> list = new LinkedList<String[]>();
for (int i = 0; i < sheet.getLastRowNum(); i++) {
Row row = sheet.getRow(i);
if(null == row)continue; // 这一行是空行:没东西
int firstCellNum = row.getFirstCellNum();
int lastCellNum = row.getLastCellNum();
String[] oneRowValue = new String;
int countNoneCol = 0;
for (int j = firstCellNum; j < lastCellNum; j++) {
Cell cell = row.getCell(j);
int thisCol = j - firstCellNum;
if(null == cell) {
oneRowValue = "";
continue;
}
switch (cell.getCellType()) // Cell.getCellType() has not deprecated in API
{
// Because of Cell.getCellType().getCode() has deprecated And do not provide a replacement way So just use direct ways
case -1 : // _NONE
oneRowValue = "";
break;
case 0 : // NUMERIC Date
short dataFormat = cell.getCellStyle().getDataFormat();
// 31/176 : yyyy年m月d日(自定义型和日期型) , 58 : m月d日 , 32 : h时mm分
// 有时候176也是数字型,那这样会报错,干脆直接抓个异常。总之抓一堆dateFormat来兼容就好了。
if(HSSFDateUtil.isCellDateFormatted(cell) || HSSFDateUtil.isCellDateFormatted(cell)
|| 31 == dataFormat || 58 == dataFormat || 32 == dataFormat || 176 == dataFormat) {
try {
oneRowValue = DateFormatUtils.format(cell.getDateCellValue(), "yyyy-MM-dd hh:MM:ss");
}catch ( Exception e) {
try {
cell.setCellType(CellType.STRING);
}catch (Exception ex) {
//出问题就不转
}
oneRowValue = String.valueOf(cell.getStringCellValue());
}
} else {
try {
cell.setCellType(CellType.STRING);
}catch (Exception ex) {
//出问题就不转
}
oneRowValue = String.valueOf(cell.getStringCellValue());
}
break;
case 1 : // STRING
oneRowValue = cell.getStringCellValue();
break;
case 2 : // FORMULA
try {
oneRowValue = String.valueOf(cell.getNumericCellValue());
} catch (IllegalStateException e) {
try {
oneRowValue = String.valueOf(cell.getRichStringCellValue());
} catch (Exception ex){
oneRowValue = "";
}
}
break;
case 3 : // BLANK
oneRowValue = "";
break;
case 4 : // BOOLEAN
oneRowValue = String.valueOf(cell.getBooleanCellValue());
break;
case 5 : // ERROR
oneRowValue = String.valueOf(cell.getErrorCellValue());
break;
default :
oneRowValue = "";
break;
} // end switch
if("".equals(oneRowValue)) countNoneCol++;
} // end for
if(countNoneCol == (lastCellNum - firstCellNum)) continue;//这一行是空行,里面的东西都无效
list.add(oneRowValue);
} // end for 2
return list.iterator();
}
}
c). XlsReadOnlyMethod:eventmodel的模式,它是监听器模式,读文件的时候不停记录读取的记录,根据不同的记录特征读取不同的值。
package excel.POI;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.record.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
* 只可读类型的XlsMethod,比可读写快(比例:20/13
*/
public class XlsReadOnlyMethod implements HSSFListener {
private SSTRecord sstRec = null;
private List<String[]> iterList = new LinkedList<String[]>();
private String[] tmpStr_ = null;
private int colLastRealIndex = 0; //总列
private int rowLastRealIndex = 0; //总行
private int thisRow = 0; // 当前行指针
private boolean helpThisRow = true;
private boolean shouldInit = true;
private static int sheetAt = 0; // 传入的sheet,不用static会丢失值
private int count = 0; // 用来计数是第几个sheet
private boolean isActiveSheet = false; // 用来标记是第几个sheet
private static FormatTrackingHSSFListener formatListener;
public static void main(String[] args) {
try {
Long start = System.currentTimeMillis();
String filePath = "F:/temp/excel/1.xls";
XlsReadOnlyMethod xlsReadOnlyMethod = new XlsReadOnlyMethod();
Iterator iter = xlsReadOnlyMethod.RowIterator(filePath, 0);
Long end = System.currentTimeMillis();
System.out.println("spending time = " + (end - start));
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* sheet想调哪张调哪张。操作流程:
*1. 构造HSSFRequest对象
*2. 使用输入流让事件工厂运作,直接监听到读取数据的事件,在监听processRecord中拿数据。
*/
public Iterator<String[]> RowIterator(String filePath,int sheetAt) throws IOException {
this.sheetAt = sheetAt;
FileInputStream fin = new FileInputStream(filePath);
InputStream inputStream = new POIFSFileSystem(fin).createDocumentInputStream("Workbook");
HSSFRequest hssfRequest = new HSSFRequest();
// hssfRequest.addListenerForAllRecords(new XlsReadOnlyMethod(this.getSource())); // add listener
HSSFEventFactory hssfEventFactory = new HSSFEventFactory();
// 使用formatListener可以分别获得date和double
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
hssfRequest.addListenerForAllRecords(formatListener);
hssfEventFactory.processEvents(hssfRequest, inputStream);
if(null != tmpStr_)
if(0 != tmpStr_.length) {
iterList.add(tmpStr_);
}
fin.close();
inputStream.close();
return iterList.iterator();
}
/**
*实现HSSFListener的方法,【监听器】获取xls的内容
*eg : http://poi.apache.org/spreadsheet/how-to.html
* */
/**
* 1. 初始化
* 获取一行所有列
* 设置【当前行指针】
* 初始化tmpStr_
* 设置初始化在每行只走一次 === 3/2
* 2. 获取值
* 把值丢到tmpStr_里(空的就不丢
* 3. 换行
* tmpStr_加到iterList里去
* 如果【当前行指针】跳到下一行,我就换行,并初始化1/3
* 4. 循环
* 到最后一行的时候跳出来,我再执行一边添加。
* */
public void processRecord(Record record) {
switch (record.getSid())
{
// the BOFRecord can represent either the beginning of a sheet or the workbook
case BOFRecord.sid:
BOFRecord bof = (BOFRecord) record;
if (bof.getType() == bof.TYPE_WORKSHEET) { //遇到sheet
if(count++ == sheetAt) {
isActiveSheet = true;
} else {
isActiveSheet = false;
}
}
break;
case RowRecord.sid:// 用来初始化当前行有几个cell
if(isActiveSheet) {
if(null != tmpStr_) {
iterList.add(tmpStr_);
}
RowRecord rowrec = (RowRecord) record;
// 可以获得行列的最大值。 如果有错就给后面的数字加大。
colLastRealIndex = rowrec.getLastCol() - rowrec.getFirstCol() + 5;
rowLastRealIndex = rowrec.getRowNumber();
if(helpThisRow) { //初始化当前行指针
thisRow = rowrec.getRowNumber();
shouldInit = true;
helpThisRow = false;
}
// System.out.println("我要判定 " + rowrec.getRowNumber() + " 行有" + (colLastRealIndex)+ "个cell了");
}
break;
/**
*不管是哪条路,只要第一次跑到 就开始
* */
case NumberRecord.sid: // Date int double
if(isActiveSheet) {
NumberRecord numrec = (NumberRecord) record;
//1.如果是下一行,2.如果是第一次运行,我就初始化
checkNextRowAndInit(numrec);
tmpStr_ = formatListener.formatNumberDateCell(numrec);
// System.out.println("Cell found with value " + formatListener.formatNumberDateCell(numrec)
// + " at row " + numrec.getRow() + " and column " + numrec.getColumn());
}
break;
case FormulaRecord.sid:// formula
if(isActiveSheet) {
FormulaRecord forRec = (FormulaRecord) record;
checkNextRowAndInit(forRec);
tmpStr_ = String.valueOf(forRec.getValue());
// System.out.println("Fomula found with value " + forRec.getValue() + " at row " + forRec.getRow() + " and column " + forRec.getColumn());
}
break;
// SSTRecords store a array of unique strings used in Excel.
case SSTRecord.sid:
sstRec = (SSTRecord) record; // to get SSTRecord
break;
case LabelSSTRecord.sid:
if(isActiveSheet) {
LabelSSTRecord lrec = (LabelSSTRecord) record;
checkNextRowAndInit(lrec);
tmpStr_ = String.valueOf(sstRec.getString(lrec.getSSTIndex()));
// System.out.println("String cell found with value "
// + sstRec.getString(lrec.getSSTIndex()) + " at row " + lrec.getRow()
// + " and column " + lrec.getColumn());
}
break;
}
}
private void checkNextRowAndInit(CellRecord record) {
if(thisRow != record.getRow()) { // 已经到下一行了
thisRow = record.getRow();
iterList.add(tmpStr_);
shouldInit = true;
}
if(shouldInit) {
tmpStr_ = new String;
shouldInit = false;
}
}
}
两种模式由于驱动方式不同,获取数据时的方式和对数据的处理方式也不同。
对于37.9M的xls文件,两者读取速度对比如下:
ReadOnlyMethod : spending time = 4438
ReadWriteMethod : get data time = 7056
对于45M的xls文件,两者读取速度对比如下:
ReadOnlyMethod : spending time = 13456
ReadWriteMethod : get data time = 18136
显然,只读方法不仅耗费更少的资源,而且速度也更快,因此如果没有对文件写的操作,使用只读方法更为优秀。(^o^
文件github : https://github.com/EleComb/maven_test
最后祝大家学习进步!
谢谢分享 十分感谢分享。 额,这个代码看着有点...,可能是52破解不大支持发代码的原因吧 aspose 用过没有,感觉还是可以的 谢谢分享,正好要用到 谢谢楼主的分享
页:
[1]