mapreduce怎么筛选空字符
本帖最后由 牵手丶若相惜 于 2019-10-29 18:24 编辑
60.174.83.183|q11.cnzz.com|20170207161935|42.120.219.31|0
60.174.83.183|textlink.simba.taobao.com|20170207161935|140.205.62.20|0
60.174.83.184|www.taobao.com|20170207161935|124.112.127.48|0
60.174.83.226|p3.ssl.qhimg.com|20170207161935|0
60.174.83.238|show-s.mediav.com|20170207161935|180.163.255.159|0
60.174.83.181|q11.cnzz.com|20170207161935|42.120.219.31|0
60.174.83.182|textlink.simba.taobao.com|140.205.62.20|0
74.125.77.73|89.110.170.60.in-addr.arpa|20170207161935||3
60.174.83.195|www.taobao.com|20170207161935|124.112.127.48|0
60.174.83.200|p3.ssl.qhimg.com|20170207161935|101.227.5.22;101.227.5.23|0
60.174.83.238|show-s.mediav.com|20170207161935|180.163.255.159|0
60.174.83.156|q11.cnzz.com|20170207161935|42.120.219.31|0
74.125.80.70|144.107.102.114.in-addr.arpa|20170207161935||3
60.174.83.152|textlink.simba.taobao.com|20170207161935|140.205.62.20|0
60.174.83.182|www.taobao.com|201702071619350|0
60.174.83.226|p3.ssl.qhimg.com|201702071619350|0
60.174.83.231|show-s.mediav.com|20170207161935|180.163.255.159|0
61.220.10.193|242.26.103.114.in-addr.arpa|20170207161935||3
89.67.84.50|85.235.178.220.in-addr.arpa|20170207161935||3
要求:将字段个数不满足5个的数据过滤掉,并且将网站地址为“www.taobao.com”的记录标记为购物网站,即把该地址替换为“ShoppingAction”,最后将清洗过滤后的数据全部输出
分隔符:|
我只能把字段不满足5个的给筛选掉 ,其他的解决不了 求大神
我用for遍历判断,还是判断不出来
spark SQL translate 转换
foreach 本帖最后由 15212520947 于 2019-10-3 16:09 编辑
Mapper端
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Cleans pipe-delimited log records.
 *
 * <p>Each input line is split on {@code |}. Lines that do not split into exactly
 * {@link #FIELD_COUNT} fields are dropped. If the address field (index 1) is exactly
 * {@code www.taobao.com} it is replaced with {@code ShoppingAction}. Every surviving
 * record is emitted as a {@link MyBean} key with a {@link NullWritable} value.
 *
 * <p>Note: {@link String#split(String)} discards trailing empty strings, so a line
 * that ends in one or more {@code |} characters yields fewer fields than it has
 * delimiters. Empty fields in the middle of a line (e.g. {@code a|b|c||3}) are kept
 * as empty strings and still count toward the field total.
 */
public class MyMapper extends Mapper<LongWritable, Text, MyBean, NullWritable> {
    /** Number of '|'-separated fields a record must have to be kept. */
    private static final int FIELD_COUNT = 5;
    /** Position of the site address within a record. */
    private static final int ADDRESS_INDEX = 1;
    /** Address that identifies a shopping site. */
    private static final String SHOPPING_SITE = "www.taobao.com";
    /** Label written in place of the shopping-site address. */
    private static final String SHOPPING_LABEL = "ShoppingAction";

    // Single bean instance refilled for every record, as in the original code.
    MyBean bean = new MyBean();

    /**
     * Filters and normalizes one input line.
     *
     * @param key     input key supplied by the framework (not used)
     * @param value   one raw input line
     * @param context sink for the cleaned record
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, MyBean, NullWritable>.Context context)
            throws IOException, InterruptedException {
        // '|' is a regex metacharacter, so it must be escaped for split().
        String[] words = value.toString().split("\\|");

        // Keep only records with exactly the expected number of fields.
        if (words.length != FIELD_COUNT) {
            return;
        }

        // Compare the address FIELD (not the array) and replace just that field.
        if (SHOPPING_SITE.equals(words[ADDRESS_INDEX])) {
            words[ADDRESS_INDEX] = SHOPPING_LABEL;
        }

        bean.setBean(words[0], words[1], words[2], words[3], words[4]);
        context.write(bean, NullWritable.get());
    }
}
Reducer端
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Pass-through reducer: writes the incoming key to the output once for every value
 * that was grouped under it, paired with that value.
 */
public class MyReducer extends Reducer<MyBean, NullWritable, MyBean, NullWritable> {
    /**
     * Emits {@code key} once per element of {@code value}.
     *
     * @param key     the grouped key
     * @param value   the values grouped under {@code key}
     * @param context sink for the output pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(MyBean key, Iterable<NullWritable> value,
            Reducer<MyBean, NullWritable, MyBean, NullWritable>.Context context)
            throws IOException, InterruptedException {
        // Explicit iterator walk; one output record per grouped value.
        java.util.Iterator<NullWritable> cursor = value.iterator();
        while (cursor.hasNext()) {
            NullWritable placeholder = cursor.next();
            context.write(key, placeholder);
        }
    }
}
自建的类MyBean
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * One cleaned log record made of five string fields, usable as a MapReduce key.
 *
 * <p>Ordering, equality and hashing are all defined field by field in the order
 * {@code ip, addres, data, ips, zero}, so that distinct records are distinct keys.
 * {@link #toString()} renders the fields separated by tab characters.
 */
public class MyBean implements WritableComparable<MyBean> {
    String ip;
    String addres;
    String data;
    String ips;
    String zero;

    /** Creates an empty bean; all fields are {@code null} until set or read. */
    public MyBean() {
    }

    /**
     * Creates a bean with all five fields populated.
     *
     * @param ip     first field of the record
     * @param addres second field of the record
     * @param data   third field of the record
     * @param ips    fourth field of the record
     * @param zero   fifth field of the record
     */
    public MyBean(String ip, String addres, String data, String ips, String zero) {
        this.ip = ip;
        this.addres = addres;
        this.data = data;
        this.ips = ips;
        this.zero = zero;
    }

    /**
     * Overwrites all five fields at once, allowing one instance to be reused.
     *
     * @param ip     first field of the record
     * @param addres second field of the record
     * @param data   third field of the record
     * @param ips    fourth field of the record
     * @param zero   fifth field of the record
     */
    public void setBean(String ip, String addres, String data, String ips, String zero) {
        this.ip = ip;
        this.addres = addres;
        this.data = data;
        this.ips = ips;
        this.zero = zero;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getAddres() {
        return addres;
    }

    public void setAddres(String addres) {
        this.addres = addres;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    public String getIps() {
        return ips;
    }

    public void setIps(String ips) {
        this.ips = ips;
    }

    public String getZero() {
        return zero;
    }

    public void setZero(String zero) {
        this.zero = zero;
    }

    /** Returns the five fields joined by tab characters. */
    @Override
    public String toString() {
        return ip + "\t" + addres + "\t" + data + "\t" + ips + "\t" + zero;
    }

    /**
     * Serializes the five fields, in declaration order, with {@code writeUTF}.
     *
     * @param out destination stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(ip);
        out.writeUTF(addres);
        out.writeUTF(data);
        out.writeUTF(ips);
        out.writeUTF(zero);
    }

    /**
     * Restores the five fields in the same order {@link #write(DataOutput)} emits them.
     *
     * @param in source stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.ip = in.readUTF();
        this.addres = in.readUTF();
        this.data = in.readUTF();
        this.ips = in.readUTF();
        this.zero = in.readUTF();
    }

    /**
     * Orders beans field by field ({@code ip, addres, data, ips, zero}).
     *
     * <p>The previous implementation always returned 0, which made every key compare
     * equal. A {@code null} field sorts before any non-null value.
     *
     * @param o the bean to compare against
     * @return negative, zero or positive as this bean sorts before, equal to or after {@code o}
     */
    @Override
    public int compareTo(MyBean o) {
        int result = compareField(ip, o.ip);
        if (result != 0) {
            return result;
        }
        result = compareField(addres, o.addres);
        if (result != 0) {
            return result;
        }
        result = compareField(data, o.data);
        if (result != 0) {
            return result;
        }
        result = compareField(ips, o.ips);
        if (result != 0) {
            return result;
        }
        return compareField(zero, o.zero);
    }

    /** Null-safe string comparison; {@code null} sorts first. */
    private static int compareField(String left, String right) {
        if (left == null) {
            return right == null ? 0 : -1;
        }
        if (right == null) {
            return 1;
        }
        return left.compareTo(right);
    }

    /** Two beans are equal when all five fields are equal; consistent with {@link #compareTo}. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof MyBean)) {
            return false;
        }
        MyBean other = (MyBean) obj;
        return java.util.Objects.equals(ip, other.ip)
                && java.util.Objects.equals(addres, other.addres)
                && java.util.Objects.equals(data, other.data)
                && java.util.Objects.equals(ips, other.ips)
                && java.util.Objects.equals(zero, other.zero);
    }

    /**
     * Content-based hash over the five fields, so equal beans hash equally.
     * NOTE(review): hash-based partitioning of keys relies on this being stable
     * across JVMs; confirm against the partitioner the job actually uses.
     */
    @Override
    public int hashCode() {
        return java.util.Objects.hash(ip, addres, data, ips, zero);
    }
}
15212520947 发表于 2019-10-3 16:08
Mapper端
import java.io.IOException;
这个问题已经解决了
是因为" | "分隔符 需要\\来转义
但是还是谢谢{:1_893:} 牵手丶若相惜 发表于 2019-10-4 22:55
这个问题已经解决了
是因为" | "分隔符 需要\\来转义
但是还是谢谢
我也刚学没多久,这题顺便做到了:lol 你这几个要求在map阶段其实就直接可以处理掉。public class NewTest {
/**
 * Map-only cleaner for pipe-delimited log lines.
 *
 * <p>Replaces every occurrence of {@code www.taobao.com} in the line with
 * {@code ShoppingAction}, then emits the rewritten line only if it splits into
 * exactly five non-empty {@code |}-separated fields.
 */
private static class MyMapper extends Mapper<Object, Text, Text, NullWritable> {
    /**
     * Filters and rewrites one input line.
     *
     * @param k1      input key supplied by the framework (not used)
     * @param v1      one raw input line
     * @param context sink for the cleaned line
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(Object k1, Text v1, Context context) throws IOException, InterruptedException {
        String line = v1.toString();
        // Substitution is applied to the whole line, not just the address field.
        String data = line.replace("www.taobao.com", "ShoppingAction");
        // '|' is a regex metacharacter, so it must be escaped for split().
        String[] words = data.split("\\|");

        if (words.length != 5) {
            return;
        }
        // Test each FIELD for emptiness (the array itself has no isEmpty()).
        for (String word : words) {
            if (word.isEmpty()) {
                return;
            }
        }
        context.write(new Text(data), NullWritable.get());
    }
}
页:
[1]