Advanced MapReduce Training
1. Internet Traffic Statistics
The data format is as follows (the original sample data is not reproduced here; judging from the field indexes used in the Mapper below, each line is a tab-separated record in which field 1, zero-based, is the phone number and fields 6 through 9 are the upstream packet count, downstream packet count, upstream traffic total, and downstream traffic total):
2. Requirement: Sum Statistics
For each phone number, compute the total upstream packet count, the total downstream packet count, the total upstream traffic, and the total downstream traffic.
Analysis: use the phone number as the key, and the four fields (upstream packets, downstream packets, upstream traffic total, downstream traffic total) as the value. This key/value pair is the output of the map phase and the input of the reduce phase; a worked example follows under "Approach Analysis" below.
1) Approach Analysis
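A quick walk-through of the key/value flow with made-up numbers (the phone number and all counts below are hypothetical, used only to illustrate the shape of the data at each stage):

    Input lines (tab-separated; only the phone and traffic fields shown):
    13726230503 ... 24 27 2481 24681
    13726230503 ... 12 15 1938 2910

    Map output (K2, V2), one pair per input line:
    ("13726230503", FlowBean(24, 27, 2481, 24681))
    ("13726230503", FlowBean(12, 15, 1938, 2910))

    Reduce input after the shuffle (K2, [V2, ...]):
    ("13726230503", [FlowBean(24, 27, 2481, 24681), FlowBean(12, 15, 1938, 2910)])

    Reduce output (K3, V3), the four fields summed per key:
    ("13726230503", FlowBean(36, 42, 4419, 27591))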
2) Code Implementation
Step 1: Define FlowBean, the custom value type for the map output
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// If the JavaBean in MR is not used as K2, it only needs to implement the Writable interface
public class FlowBean implements Writable {
    private Integer upFlow;        // upstream packet count
    private Integer downFlow;      // downstream packet count
    private Integer upCountFlow;   // upstream traffic total
    private Integer downCountFlow; // downstream traffic total

    public FlowBean() {
    }

    public FlowBean(Integer upFlow, Integer downFlow, Integer upCountFlow, Integer downCountFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.upCountFlow = upCountFlow;
        this.downCountFlow = downCountFlow;
    }

    public Integer getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Integer upFlow) {
        this.upFlow = upFlow;
    }

    public Integer getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Integer downFlow) {
        this.downFlow = downFlow;
    }

    public Integer getUpCountFlow() {
        return upCountFlow;
    }

    public void setUpCountFlow(Integer upCountFlow) {
        this.upCountFlow = upCountFlow;
    }

    public Integer getDownCountFlow() {
        return downCountFlow;
    }

    public void setDownCountFlow(Integer downCountFlow) {
        this.downCountFlow = downCountFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + upCountFlow + "\t" + downCountFlow;
    }

    // Serialization
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(upFlow);
        out.writeInt(downFlow);
        out.writeInt(upCountFlow);
        out.writeInt(downCountFlow);
    }

    // Deserialization (fields must be read in the same order they were written)
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readInt();
        this.downFlow = in.readInt();
        this.upCountFlow = in.readInt();
        this.downCountFlow = in.readInt();
    }
}
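Not part of the original post: a minimal local round-trip check of the Writable methods, useful for verifying that write() and readFields() agree on field order. It assumes FlowBean is on the classpath; the four values are made up.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean(24, 27, 2481, 24681);

        // Serialize with write()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize with readFields()
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Should print the same four tab-separated values twice
        System.out.println(original);
        System.out.println(copy);
    }
}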
Step 2: Define the FlowCountMapper class
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1: Split the line of text (split V1)
        String[] split = value.toString().split("\t");

        // 2: Extract the phone number from the split array to get K2
        String phoneNum = split[1];

        // 3: Extract the four traffic fields from the split array and wrap them in a FlowBean to get V2
        FlowBean flowBean = new FlowBean();
        flowBean.setUpFlow(Integer.parseInt(split[6]));
        flowBean.setDownFlow(Integer.parseInt(split[7]));
        flowBean.setUpCountFlow(Integer.parseInt(split[8]));
        flowBean.setDownCountFlow(Integer.parseInt(split[9]));

        // 4: Write K2 and V2 to the context
        context.write(new Text(phoneNum), flowBean);
    }
}
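Also not from the original post: a tiny standalone check of the field-extraction logic. The record below is entirely made up and follows the assumed layout (phone number in field 1, the four traffic fields in fields 6 through 9 of a tab-separated line); running it shows which columns the Mapper actually reads.

public class ParseCheck {
    public static void main(String[] args) {
        // Hypothetical record; fields 2-5 are filler
        String line = "1363157985066\t13726230503\thost1\tsite1\tvideo\t200"
                + "\t24\t27\t2481\t24681";
        String[] split = line.split("\t");
        System.out.println("phone         = " + split[1]);
        System.out.println("upFlow        = " + Integer.parseInt(split[6]));
        System.out.println("downFlow      = " + Integer.parseInt(split[7]));
        System.out.println("upCountFlow   = " + Integer.parseInt(split[8]));
        System.out.println("downCountFlow = " + Integer.parseInt(split[9]));
    }
}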
Step 3: Define the FlowCountReducer class
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        // 1: Define four variables to accumulate the upstream packets, downstream packets,
        //    upstream traffic total, and downstream traffic total
        int upFlow = 0;
        int downFlow = 0;
        int upCountFlow = 0;
        int downCountFlow = 0;

        // 2: Iterate over the values and add up the four traffic fields of every FlowBean
        for (FlowBean flowBean : values) {
            upFlow += flowBean.getUpFlow();
            downFlow += flowBean.getDownFlow();
            upCountFlow += flowBean.getUpCountFlow();
            downCountFlow += flowBean.getDownCountFlow();
        }

        // 3: K3 is the original K2; V3 is a new FlowBean holding the sums
        FlowBean flowBean = new FlowBean(upFlow, downFlow, upCountFlow, downCountFlow);

        // 4: Write K3 and V3 to the context
        context.write(key, flowBean);
    }
}
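One subtlety the original post does not mention: Hadoop reuses a single Writable instance across iterations of the reduce-side values iterable, so caching references to the FlowBean objects would be a bug. The summing code above is safe because it copies out primitive values each iteration. The plain-Java mock below (not Hadoop itself) simulates that reuse to show what goes wrong when references are cached:

import java.util.ArrayList;
import java.util.List;

public class ReuseDemo {
    public static void main(String[] args) {
        int[][] rows = { {24, 27, 2481, 24681}, {12, 15, 1938, 2910} };
        FlowBean shared = new FlowBean();   // one instance, reused like Hadoop's iterator does
        List<FlowBean> cached = new ArrayList<>();
        for (int[] r : rows) {
            shared.setUpFlow(r[0]);
            shared.setDownFlow(r[1]);
            shared.setUpCountFlow(r[2]);
            shared.setDownCountFlow(r[3]);
            cached.add(shared);             // bug: same object added each time
        }
        // Both lines print the LAST row's values (12 15 1938 2910)
        System.out.println(cached.get(0));
        System.out.println(cached.get(1));
    }
}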
Step 4: Define FlowCountRunner, the program's main entry point
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

public class FlowCountRunner {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1: Create a job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "flowcount_demo");

        // 2: Specify the jar that contains the job
        job.setJarByClass(FlowCountRunner.class);

        // 3: Specify the input format class (line-by-line reading) and the input path;
        //    only the directory containing the source files is needed
        job.setInputFormatClass(TextInputFormat.class);
        //TextInputFormat.addInputPath(job, new Path("hdfs://node1:8020/input/wordcount"));
        TextInputFormat.addInputPath(job, new Path("file:///E:\\input\\flowcount"));

        // 4: Specify the custom Mapper class and the K2/V2 types
        job.setMapperClass(FlowCountMapper.class);  // Mapper class
        job.setMapOutputKeyClass(Text.class);       // K2 type
        job.setMapOutputValueClass(FlowBean.class); // V2 type

        // 5: Specify a custom partitioner class (if any)
        // 6: Specify a custom grouping class (if any)
        // 7: Specify a custom Combiner class (if any)
        //    The number of ReduceTasks can also be set here

        // 8: Specify the custom Reducer class and the K3/V3 types
        job.setReducerClass(FlowCountReducer.class); // Reducer class
        job.setOutputKeyClass(Text.class);           // K3 type
        job.setOutputValueClass(FlowBean.class);     // V3 type

        // 9: Specify the output format class and the output path
        //    (the target directory must not already exist, or the job fails)
        job.setOutputFormatClass(TextOutputFormat.class);
        //TextOutputFormat.setOutputPath(job, new Path("hdfs://node1:8020/output/wordcount"));
        TextOutputFormat.setOutputPath(job, new Path("file:///E:\\output\\flowcount"));

        // 10: Submit the job (to the YARN cluster when run there)
        boolean bl = job.waitForCompletion(true); // true prints the job's progress

        // 11: Exit the process with the job's status
        System.exit(bl ? 0 : 1);
    }
}
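An optional optimization, not in the original post: because the reduce logic is a pure per-key sum (associative and commutative) and the reducer's input and output types are both (Text, FlowBean), the same FlowCountReducer class can also be registered as a Combiner at step 7 of the runner. This pre-aggregates map output on each map task and shrinks shuffle traffic, without changing the final result:

// Add inside main(), at the "step 7" placeholder above
job.setCombinerClass(FlowCountReducer.class);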
This article was originally written by Lansonli and first published on the CSDN blog.