cleanup() 方法如何工作?
How does cleanup() method work?
我目前是 Hadoop 的新手。所以我在 MapReduce 中解决了一段代码,它找到了 "parts of a country with most 'Data Engineer' jobs for each year"(例如,如果数据格式为 (Year,Region,Count(Jobs)) 是 "2016,'XYZ',35" 和 "2016,'ABC',25" 和 "2015,'sdf',14" ,答案是 "2016,'XYZ',35" 和 "2015,'sdf',14"), 但我无法理解reducer中的部分如下:-
if (Top5DataEngineer.size() > 1)
Top5DataEngineer.remove(Top5DataEngineer.firstKey());
}//Ignore this bracket for the time being.
protected void cleanup(Context context) throws IOException,
InterruptedException {
for (Text t : Top5DataEngineer.descendingMap().values())
context.write(NullWritable.get(), t);
}
这是完整代码:-
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.TreeMap;
import org.apache.hadoop.mapreduce.Reducer;
public class Q_002a {
public static class Q_002a_Mapper extends
Mapper<LongWritable, Text, Text, LongWritable> {
LongWritable one = new LongWritable(1);
public void map(LongWritable key, Text values, Context context)
throws IOException, InterruptedException {
try {
if (key.get() > 0)
{
String[] token = values.toString().split("\t");
if (token[4].equals("DATA ENGINEER")) {
Text answer = new Text(token[8] + "\t" + token[7]);
context.write(answer, one);
}
}
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println(e.getMessage());
} catch (ArithmeticException e1) {
System.out.println(e1.getMessage());
}
}
}
public static class Q_002a_Partitioner extends Partitioner<Text, LongWritable> {
@Override
public int getPartition(Text key, LongWritable value, int numReduceTasks) {
String[] str = key.toString().split("\t");
if (str[1].equals("2011"))
return 0;
if (str[1].equals("2012"))
return 1;
if (str[1].equals("2013"))
return 2;
if (str[1].equals("2014"))
return 3;
if (str[1].equals("2015"))
return 4;
if (str[1].equals("2016"))
return 5;
else
return 6;
}
}
public static class Q_002a_Reducer extends
Reducer<Text, LongWritable, NullWritable, Text> {
private TreeMap<LongWritable, Text> Top5DataEngineer = new TreeMap<LongWritable, Text>();
long sum = 0;
public void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
sum = 0;
for (LongWritable val : values) {
sum += val.get();
}
Top5DataEngineer.put(new LongWritable(sum), new Text(key + ","
+ sum));
if (Top5DataEngineer.size() > 1)
Top5DataEngineer.remove(Top5DataEngineer.firstKey());
}
protected void cleanup(Context context) throws IOException,
InterruptedException {
for (Text t : Top5DataEngineer.descendingMap().values())
context.write(NullWritable.get(), t);
}
}
public static void main(String args[]) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Top 5 Data Engineer in a worksite");
job.setJarByClass(Q_002a.class);
job.setMapperClass(Q_002a_Mapper.class);
job.setPartitionerClass(Q_002a_Partitioner.class);
job.setReducerClass(Q_002a_Reducer.class);
job.setNumReduceTasks(6);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
这是我得到的输出:-
编辑:- 我在 reduce() 方法中尝试了 运行 cleanup() 方法中的代码,但它没有按预期工作。只有在 cleanup() 方法中才 运行 没问题。对此有任何帮助,我们将不胜感激。
cleanup()
方法将在处理阶段完成时调用。而且它只会被调用一次。
在您的示例中,reduce()
方法是 "searching" 年分区中按城市划分的数据工程师职位的最大总数。 Top5DataEngineer
TreeMap 按排序(升序)顺序存储键,并且在每次迭代中,如果它有多个键,它只会删除第一个键(较小的键)。换句话说,在处理 Iterable<LongWritable>
值后,您将在每个 'years' 分区中得到一个工作岗位数量最多的城市。
当 reducer 阶段完成时,cleanup()
方法简单地写入每个处理分区的结果(single/biggest kv-pair in Top5DataEngineer
map)。
cleanup()
方法将为每个 'years' 分区调用一次。
希望对您有所帮助。
我目前是 Hadoop 的新手。所以我在 MapReduce 中解决了一段代码,它找到了 "parts of a country with most 'Data Engineer' jobs for each year"(例如,如果数据格式为 (Year,Region,Count(Jobs)) 是 "2016,'XYZ',35" 和 "2016,'ABC',25" 和 "2015,'sdf',14" ,答案是 "2016,'XYZ',35" 和 "2015,'sdf',14"), 但我无法理解reducer中的部分如下:-
if (Top5DataEngineer.size() > 1)
Top5DataEngineer.remove(Top5DataEngineer.firstKey());
}//Ignore this bracket for the time being.
protected void cleanup(Context context) throws IOException,
InterruptedException {
for (Text t : Top5DataEngineer.descendingMap().values())
context.write(NullWritable.get(), t);
}
这是完整代码:-
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.TreeMap;
import org.apache.hadoop.mapreduce.Reducer;
public class Q_002a {
public static class Q_002a_Mapper extends
Mapper<LongWritable, Text, Text, LongWritable> {
LongWritable one = new LongWritable(1);
public void map(LongWritable key, Text values, Context context)
throws IOException, InterruptedException {
try {
if (key.get() > 0)
{
String[] token = values.toString().split("\t");
if (token[4].equals("DATA ENGINEER")) {
Text answer = new Text(token[8] + "\t" + token[7]);
context.write(answer, one);
}
}
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println(e.getMessage());
} catch (ArithmeticException e1) {
System.out.println(e1.getMessage());
}
}
}
public static class Q_002a_Partitioner extends Partitioner<Text, LongWritable> {
@Override
public int getPartition(Text key, LongWritable value, int numReduceTasks) {
String[] str = key.toString().split("\t");
if (str[1].equals("2011"))
return 0;
if (str[1].equals("2012"))
return 1;
if (str[1].equals("2013"))
return 2;
if (str[1].equals("2014"))
return 3;
if (str[1].equals("2015"))
return 4;
if (str[1].equals("2016"))
return 5;
else
return 6;
}
}
public static class Q_002a_Reducer extends
Reducer<Text, LongWritable, NullWritable, Text> {
private TreeMap<LongWritable, Text> Top5DataEngineer = new TreeMap<LongWritable, Text>();
long sum = 0;
public void reduce(Text key, Iterable<LongWritable> values,
Context context) throws IOException, InterruptedException {
sum = 0;
for (LongWritable val : values) {
sum += val.get();
}
Top5DataEngineer.put(new LongWritable(sum), new Text(key + ","
+ sum));
if (Top5DataEngineer.size() > 1)
Top5DataEngineer.remove(Top5DataEngineer.firstKey());
}
protected void cleanup(Context context) throws IOException,
InterruptedException {
for (Text t : Top5DataEngineer.descendingMap().values())
context.write(NullWritable.get(), t);
}
}
public static void main(String args[]) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Top 5 Data Engineer in a worksite");
job.setJarByClass(Q_002a.class);
job.setMapperClass(Q_002a_Mapper.class);
job.setPartitionerClass(Q_002a_Partitioner.class);
job.setReducerClass(Q_002a_Reducer.class);
job.setNumReduceTasks(6);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
这是我得到的输出:-
编辑:- 我在 reduce() 方法中尝试了 运行 cleanup() 方法中的代码,但它没有按预期工作。只有在 cleanup() 方法中才 运行 没问题。对此有任何帮助,我们将不胜感激。
cleanup()
方法将在处理阶段完成时调用。而且它只会被调用一次。
在您的示例中,reduce()
方法是 "searching" 年分区中按城市划分的数据工程师职位的最大总数。 Top5DataEngineer
TreeMap 按排序(升序)顺序存储键,并且在每次迭代中,如果它有多个键,它只会删除第一个键(较小的键)。换句话说,在处理 Iterable<LongWritable>
值后,您将在每个 'years' 分区中得到一个工作岗位数量最多的城市。
当 reducer 阶段完成时,cleanup()
方法简单地写入每个处理分区的结果(single/biggest kv-pair in Top5DataEngineer
map)。
cleanup()
方法将为每个 'years' 分区调用一次。
希望对您有所帮助。