本帖最后由 157239486 于 2021-9-3 17:22 编辑
前几日看到有坛友发帖悬赏,需要把某网站的数据保存到 Excel;今日得闲,抽空做了出来。授人以鱼不如授人以渔,特把详细代码贴上来。
源代码地址:https://wwr.lanzoui.com/iPMT7tjqxhe
原悬赏贴地址:https://www.52pojie.cn/thread-1503464-1-1.html
开发环境:JDK1.8
开发工具:IDEA 2020.2.3
小作业:原站上面有个旧编码并未提取,有兴趣的可下载代码修改,自行尝试抓取。
创建 Java 项目
基于Maven进行构建,推荐使用Maven来安装WebMagic。在项目中添加以下坐标即可:
<dependencies>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.1</version>
</dependency>
</dependencies>
创建Model类 Data (省去set,get)
/**
 * One row scraped from the result grid: a part code and its part name.
 *
 * <p>The accessors are written out explicitly because the page processor calls
 * {@code setId}/{@code setName} and the Excel pipeline calls {@code getId}/{@code getName}.
 */
public class Data {
    /** Part code, taken from the second cell of a grid row. */
    private String id;
    /** Part name, taken from the third cell of a grid row. */
    private String name;

    /** @return the part code, or {@code null} if it has not been set */
    public String getId() {
        return id;
    }

    /** @param id the part code to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the part name, or {@code null} if it has not been set */
    public String getName() {
        return name;
    }

    /** @param name the part name to store */
    public void setName(String name) {
        this.name = name;
    }
}
创建WuErPageProcessor类(爬虫的配置、页面元素的抽取、链接的组装)
public class WuErPageProcessor implements PageProcessor {
//抓取网站的相关配置,包括抓取间隔、重试次数等
private Site site = Site.me().setRetryTimes(3).setSleepTime(300).setTimeOut(5000).setCycleRetryTimes(3);
@Override
//process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
public void process(Page page) {
//根据网页调试分析-通过xpath方式先获得表格下的所有tr
//xpath 可自行百度使用方法
List<Selectable> nodes = page.getHtml().xpath("//*[@id=\"gridview\"]/tbody/tr").nodes();
List<Object> list = new ArrayList<Object>();
//循环tr
for (int i = 0; i < nodes.size() - 1; i++) {
//新建model,并把数据爬出来并赋值给对象
Data data = new Data();
data.setId(page.getHtml().xpath("//*[@id=\"gridview\"]/tbody/tr[" + (i + 2) + "]/td[2]/text()").get());
data.setName(page.getHtml().xpath("//*[@id=\"gridview\"]/tbody/tr[" + (i + 2) + "]/td[3]/text()").get());
//加入集合
list.add(data);
}
//数据保存,以便于Pipeline处理之后数据
page.putField("list", list);
//获取当前页码
int currPageNum = Integer.parseInt(page.getHtml().$("#hidCurrentPage", "value").get());
//新建一个请求,用于把下一页地址加入爬虫队列
Request request = new Request(page.getUrl().get());
//设置为post方式(从原网页分析点击下一页是通过post方式提交)
request.setMethod(HttpConstant.Method.POST);
//新建Map 用于存放form表单的提交数据
Map<String, Object> params = new HashMap<String, Object>();
//当前页码
params.put("hidCurrentPage", currPageNum);
//告知服务器是需要下一页的数据
params.put("__EVENTTARGET", "lbtNext");
//底下作用未知,但必须传,不传服务器返回500错误
params.put("hidWhere", "");
params.put("qry_productnumber", "");
params.put("qry_name", "");
params.put("qry_new_oldnumber", "");
params.put("__VIEWSTATE", "/wEPDwUKMjAxOTg4NTI2Nw9kFgICAw9kFggCCw88KwANAgAPFgQeC18hRGF0YUJvdW5kZx4LXyFJdGVtQ291bnQCD2QBEBYDAgECAgIDFgM8KwAFAQAWBB4KSGVhZGVyVGV4dAUM5aSH5Lu257yW56CBHglEYXRhRmllbGQFDXByb2R1Y3RudW1iZXI8KwAFAQAWBB8CBQzlpIfku7blkI3np7AfAwUEbmFtZTwrAAUBABYEHwIFCeaXp+e8lueggR8DBQ1uZXdfb2xkbnVtYmVyFgNmZmYWAmYPZBYgAgEPZBYIZg9kFgJmDxYCHgV2YWx1ZQVSNDU0ZGRmYjItZDJlNC1lNDExLTk0NDAtYWMxNjJkNzI4NzczeyR96L2u5byP6KOF6L295py6TFcxMTAwS+agh+WHhuWei+W6t+aYjuaWr+KFoWQCAQ8PFgIeBFRleHQFCDExMDBLTjAxZGQCAg8PFgIfBQUr6L2u5byP6KOF6L295py6TFcxMTAwS+agh+WHhuWei+W6t+aYjuaWr+KFoWRkAgMPDxYCHwUFBiZuYnNwO2RkAgIPZBYIZg9kFgJmDxYCHwQFPjI5MmYzZWVkLTE2OTktZTUxMS1iNDRhLWFjMTYyZDcyODc3M3skfVFZMjVLLjk3LTE05ZWG5qCH5qCH54mMZAIBDw8WAh8FBQkxMTEyMDE3ODBkZAICDw8WAh8FBRdRWTI1Sy45Ny0xNOWVhuagh+agh+eJjGRkAgMPDxYCHwUFCDExMjExNTk0ZGQCAw9kFghmD2QWAmYPFgIfBAVBMzYxNDlkMGUtYWQ1NC1lOTExLTg2NTQtYWMxNjJkNzI4NzczeyR9UVkzMEs1LUkuMDDmsb3ovabotbfph43mnLpkAgEPDxYCHwUFCTExMTUwMDAwNWRkAgIPDxYCHwUFGlFZMzBLNS1JLjAw5rG96L2m6LW36YeN5py6ZGQCAw8PFgIfBQUIMDIxMDAwMTFkZAIED2QWCGYPZBYCZg8WAh8EBT8yYTJmM2VlZC0xNjk5LWU1MTEtYjQ0YS1hYzE2MmQ3Mjg3NzN7JH1UMTEzRzAw56uL5L2T5qCH6K+GLVhDTUdkAgEPDxYCHwUFCTEzMDAwMDY1NmRkAgIPDxYCHwUFGFQxMTNHMDDnq4vkvZPmoIfor4YtWENNR2RkAgMPDxYCHwUFBiZuYnNwO2RkAgUPZBYIZg9kFgJmDxYCHwQFNzJiMmYzZWVkLTE2OTktZTUxMS1iNDRhLWFjMTYyZDcyODc3M3skfVhaMTZLLjI4LTjmoIfniYxkAgEPDxYCHwUFCTEzMDIwMTEwMWRkAgIPDxYCHwUFEFhaMTZLLjI4LTjmoIfniYxkZAIDDw8WAh8FBQgxMTIxMTU4NmRkAgYPZBYIZg9kFgJmDxYCHwQFQWJjMDU3ODVlLTllODQtZTkxMS1hZDcwLWFjMTYyZDcyODc3M3skfVFVWTU1LjA1LjEuMy0x5omO57q/5pSv5p62ZAIBDw8WAh8FBQkxNzAyMDA1MTFkZAICDw8WAh8FBRpRVVk1NS4wNS4xLjMtMeaJjue6v+aUr+aetmRkAgMPDxYCHwUFBiZuYnNwO2RkAgcPZBYIZg9kFgJmDxYCHwQFQWJkMDU3ODVlLTllODQtZTkxMS1hZDcwLWFjMTYyZDcyODc3M3skfVFVWTU1LjA1LjEuMy0y5omO57q/5pSv5p62ZAIBDw8WAh8FBQkxNzAyMDA1MTJkZAICDw8WAh8FBRpRVVk1NS4wNS4xLjMtMuaJjue6v+aUr+aetmRkAgMPDxYCHwUFBiZuYnNwO2RkAggPZBYIZg9kFgJmDxYCHwQFPzM3MTQ5ZDBlLWFkNTQtZTkxMS04NjU0LWFjMTYyZDcyODc3M3skfVFVWTcwLjA1LjEuMS0zOOieuuavjeadv2QCAQ8PFgIfBQUJMTcwNjAwMjA4ZGQCA
g8PFgIfBQUYUVVZNzAuMDUuMS4xLTM46J665q+N5p2/ZGQCAw8PFgIfBQUIMDBFMDUwNTBkZAIJD2QWCGYPZBYCZg8WAh8EBTwzODE0OWQwZS1hZDU0LWU5MTEtODY1NC1hYzE2MmQ3Mjg3NzN7JH1RVVkxMDAuMDUuMS0zOELlvK/mnb9kAgEPDxYCHwUFCTE3MTEwMDQ4MmRkAgIPDxYCHwUFFVFVWTEwMC4wNS4xLTM4QuW8r+adv2RkAgMPDxYCHwUFCDA3MTA1MjI5ZGQCCg9kFghmD2QWAmYPFgIfBAU/MzkxNDlkMGUtYWQ1NC1lOTExLTg2NTQtYWMxNjJkNzI4NzczeyR9WEdINDAwLjA1LjEuMS4xMS0x5bqV5p2/ZAIBDw8WAh8FBQkxNzEyMDA0MjRkZAICDw8WAh8FBRhYR0g0MDAuMDUuMS4xLjExLTHlupXmnb9kZAIDDw8WAh8FBQYmbmJzcDtkZAILD2QWCGYPZBYCZg8WAh8EBT8zYTE0OWQwZS1hZDU0LWU5MTEtODY1NC1hYzE2MmQ3Mjg3NzN7JH1YR0g0MDAuMDUuMS4xLjExLTPogLPmnb9kAgEPDxYCHwUFCTE3MTIwMDQyNmRkAgIPDxYCHwUFGFhHSDQwMC4wNS4xLjEuMTEtM+iAs+adv2RkAgMPDxYCHwUFBiZuYnNwO2RkAgwPZBYIZg9kFgJmDxYCHwQFOjNiMTQ5ZDBlLWFkNTQtZTkxMS04NjU0LWFjMTYyZDcyODc3M3skfVFVWTQ1MC4wNS4xLjItNkHmnb9kAgEPDxYCHwUFCTE3NDUwMzMyM2RkAgIPDxYCHwUFE1FVWTQ1MC4wNS4xLjItNkHmnb9kZAIDDw8WAh8FBQYmbmJzcDtkZAIND2QWCGYPZBYCZg8WAh8EBUEzYzE0OWQwZS1hZDU0LWU5MTEtODY1NC1hYzE2MmQ3Mjg3NzN7JH1YR0gzMDBLLjA1LjEuMS0zMeWKoOW8uuadv2QCAQ8PFgIfBQUJMTgwMzAwMzI0ZGQCAg8PFgIfBQUaWEdIMzAwSy4wNS4xLjEtMzHliqDlvLrmnb9kZAIDDw8WAh8FBQYmbmJzcDtkZAIOD2QWCGYPZBYCZg8WAh8EBUEzZDE0OWQwZS1hZDU0LWU5MTEtODY1NC1hYzE2MmQ3Mjg3NzN7JH1YR0gzMDBLLjA1LjEuMS0zN+WKoOW8uuadv2QCAQ8PFgIfBQUJMTgwMzAwMzMwZGQCAg8PFgIfBQUaWEdIMzAwSy4wNS4xLjEtMzfliqDlvLrmnb9kZAIDDw8WAh8FBQYmbmJzcDtkZAIPD2QWCGYPZBYCZg8WAh8EBTszZTE0OWQwZS1hZDU0LWU5MTEtODY1NC1hYzE2MmQ3Mjg3NzN7JH1YR0gzMDBLLjA1LjEuMS00Meadv2QCAQ8PFgIfBQUJMTgwMzAwMzM0ZGQCAg8PFgIfBQUUWEdIMzAwSy4wNS4xLjEtNDHmnb9kZAIDDw8WAh8FBQYmbmJzcDtkZAIQDw8WAh4HVmlzaWJsZWhkZAINDw8WBB8FBQbpppbpobUeB0VuYWJsZWRoZGQCDw8PFgQfBQUJ5LiK5LiA6aG1HwdoZGQCEQ8PFgIfBQUH56ysMemhtWRkGAIFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYPBRtncmlkdmlldzpfY3RsMjpyb3dfc2VsZWN0b3IFG2dyaWR2aWV3Ol9jdGwzOnJvd19zZWxlY3RvcgUbZ3JpZHZpZXc6X2N0bDQ6cm93X3NlbGVjdG9yBRtncmlkdmlldzpfY3RsNTpyb3dfc2VsZWN0b3IFG2dyaWR2aWV3Ol9jdGw2OnJvd19zZWxlY3RvcgUbZ3JpZHZpZXc6X2N0bDc6cm93X3NlbGVjdG9yBRtncmlkdmlldzpfY3RsODpyb3dfc2VsZWN0b3IFG2dyaWR2aWV3Ol9jdGw5OnJvd19zZWxlY
3RvcgUcZ3JpZHZpZXc6X2N0bDEwOnJvd19zZWxlY3RvcgUcZ3JpZHZpZXc6X2N0bDExOnJvd19zZWxlY3RvcgUcZ3JpZHZpZXc6X2N0bDEyOnJvd19zZWxlY3RvcgUcZ3JpZHZpZXc6X2N0bDEzOnJvd19zZWxlY3RvcgUcZ3JpZHZpZXc6X2N0bDE0OnJvd19zZWxlY3RvcgUcZ3JpZHZpZXc6X2N0bDE1OnJvd19zZWxlY3RvcgUcZ3JpZHZpZXc6X2N0bDE2OnJvd19zZWxlY3RvcgUIZ3JpZHZpZXcPPCsACgEIAgFkY8td5os8BGQR7cbZfIEOZG8sZU4=");
params.put("__VIEWSTATEGENERATOR", "82E2334D");
params.put("__EVENTVALIDATION", "/wEWGgK19d3zBAKEssiCCgKm6MrEAgK51ey4DgKftpqTAwLk2dU2Ap6Ep9gOAq2t7eYLApD0z+sFAuLm2YAFAuHmnf8IAu3mga8DAuzmhc4LAuzmieQBAuPmjYMKAu/m8dEEAu7m9egKAoGqlYQJAqyN/Z8MAuen9KkPAsrugdEMAu3T9KMJAqjk6wQC4/7ijgMC1ID+5gQC9YWt/g3g7SYtsAuhVSYoftWnfLo83ayLJw==");
//设置请求体
request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
//加入爬虫队列
page.addTargetRequest(request);
}
@Override
public Site getSite() {
return site;
}
//主启动类
public static void main(String[] args) {
String uri = "http://58.218.196.218:47/ISV/XCMGKJPortal/Service/lookup.aspx?lk=srv_workorder_parts&where=&random=0.0487331111317425";
Spider.create(new WuErPageProcessor()).addUrl(uri).thread(5)//开启5个线程
.addPipeline(new ExcelPipeline("D:\\webtest"))//设置爬到数据后如何处理
.run();
}
}
新建ExcelPipeline类 (用于把爬取到的结果保存到excel)
/**
 * Pipeline that appends every crawled {@code Data} row to a single in-memory {@code .xls}
 * workbook and rewrites the output file after each processed page.
 *
 * <p>{@link #process} is synchronized: the row counter and the workbook are shared mutable state,
 * and the spider that uses this pipeline is started with several worker threads.
 */
public class ExcelPipeline extends FilePersistentBase implements Pipeline {
    /** Output file name, a timestamp followed by {@code .xls}. */
    private final String filename;
    /** Index of the next sheet row to write. */
    private int rows = 0;
    /** Workbook holding all rows collected so far. */
    private final HSSFWorkbook workbook;
    /** The single sheet the rows are written to. */
    private final HSSFSheet sheet;

    /**
     * Creates the workbook, its sheet and the header row.
     *
     * @param path directory the {@code .xls} file is written to
     */
    public ExcelPipeline(String path) {
        setPath(path);
        // The file is named after the creation time, e.g. 20210903172200.xls.
        filename = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".xls";
        workbook = new HSSFWorkbook();
        sheet = workbook.createSheet("爬取结果");

        HSSFRow header = sheet.createRow(rows);
        header.createCell(0).setCellValue("行号");
        header.createCell(1).setCellValue("备件编码");
        header.createCell(2).setCellValue("备件名称");
        // The header occupies row 0; data starts on the next row.
        rows++;
    }

    /**
     * Appends the rows stored under the {@code "list"} field and saves the workbook.
     *
     * @param resultItems results of one page; a missing {@code "list"} field is treated as no rows
     * @param task the running crawl task (unused)
     * @throws java.io.UncheckedIOException if the workbook cannot be written to disk
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        List<Data> datas = resultItems.get("list");
        if (datas != null) {
            for (Data item : datas) {
                HSSFRow row = sheet.createRow(rows);
                // The first column is a running number that equals the sheet row index.
                row.createCell(0).setCellValue(rows);
                row.createCell(1).setCellValue(item.getId());
                row.createCell(2).setCellValue(item.getName());
                rows++;
            }
        }
        // Persist after every page so an interrupted crawl still leaves a usable file.
        save();
    }

    /**
     * Writes the whole workbook to the output file, replacing the previous version.
     *
     * @throws java.io.UncheckedIOException if the file cannot be opened or written
     */
    private synchronized void save() {
        String target = this.path + filename;
        // try-with-resources closes the stream even when write() fails.
        try (FileOutputStream out = new FileOutputStream(getFile(target))) {
            workbook.write(out);
        } catch (IOException e) {
            // Surface the failure instead of silently producing no or a truncated file.
            throw new java.io.UncheckedIOException("Failed to write Excel file " + target, e);
        }
    }
}
结果示意图:
|