java判断html表格是不是标准表格
本帖最后由 1664593601 于 2019-5-14 23:27 编辑。标准表格,就是每个tr的td长度一致:有colspan、rowspan的,按合并后占用的单元格个数计算。
package test.isEquals;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ReadHtml {
public static void main(String[] args) {
System.out.println("start!");
traverseFolder2("C:/Users/cylyc/Desktop/test/newhtm");
System.out.println("end!");
}
// 获得需要的文件类型
public static void traverseFolder2(String path) {
File file = new File(path);
if (file.exists()) {
File[] files = file.listFiles();
if (null == files || files.length == 0) {
return;
} else {
for (File file2 : files) {
if (file2.isDirectory()) {
traverseFolder2(file2.getAbsolutePath());
} else {
String absolutePath = file2.getAbsolutePath();
if (absolutePath.endsWith(".html")
|| absolutePath.endsWith(".htm")) {
//System.out.println("当前检测文件名为:"+absolutePath);
String txt2String = txt2String(absolutePath);
if (!isContainsTable(txt2String)) {
System.out.println("该文件名为:"+absolutePath);
}
}
}
}
}
} else {
System.out.println("文件路径不存在!");
}
}
// 读取文件
private static String txt2String(String fileName) {
BufferedReader br = null;
StringBuffer sb = null;
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(
fileName), "UTF8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
System.out.println("文件编码错误:" + fileName);
} catch (FileNotFoundException e) {
e.printStackTrace();
System.out.println("文件找不到:" + fileName);
}
sb = new StringBuffer();
String line = null;
try {
while ((line = br.readLine()) != null) {
sb.append(line);
}
} catch (IOException e) {
e.printStackTrace();
System.out.println("IO异常" + fileName);
}
return new String(sb);
}
public static boolean isContainsTable(String content) {
boolean returnFlag = true;
Document doc = Jsoup.parse(content);
Elements tables = doc.getElementsByTag("table");
if (tables != null && tables.size() > 0) {
//表格编号
int tableNum = 0;
for (Element table : tables) {
//确定数组的大小
//获得i的最大值
int trSize = table.getElementsByTag("tr").size();
//获得j的最大值
int tdSize = table.getElementsByTag("td").size();
boolean[][] colBytes = new boolean;
tableNum++;
int maxColLength = 0;
int i = 0;
Elements trs = table.getElementsByTag("tr");
for (Element tr : trs) {
Elements tds = tr.getElementsByTag("td");
for (Element td : tds) {
int j = 0;
String col = td.attr("colspan");
col = StringUtils.isBlank(col) ? "1" : col;
String row = td.attr("rowspan");
row = StringUtils.isBlank(row) ? "1" : row;
for (int tempJ = 0; tempJ < Integer.valueOf(col); tempJ++, j++) {
while (true) {
if (colBytes) {
j++;
}else {
for (int tempI = 0; tempI < Integer.valueOf(row); tempI++) {
colBytes = true;
}
break;
}
}
}
// 记录行的最大值
maxColLength = j > maxColLength ? j : maxColLength;
}
i++;
}
for (int q = 0; q < i; q++) {
if (!colBytes) {
System.out.println("第"+tableNum+"个表格不是标准表格");
returnFlag = false;
break;
}
}
}
}
return returnFlag;
}
}
滴滴滴,来看看。 yaohuo大兄dei 来了 嘤嘤嘤 正在努力正在努力 楼主我想知道你的项目是什么奇葩需求要求验证html表格完整度 SGC沉默 发表于 2019-2-26 13:06
楼主我想知道你的项目是什么奇葩需求要求验证html表格完整度
这批html文档是从Word转过来的,公式还是字符串呢, 帮你了你自己看着办吧
页:
[1]