private Text <span style="color: #ff0000;">word</span> = newText();
publicvoidmap(Object key, Text value, Context context)// Called once for each key/value pair in the input split
throws IOException, InterruptedException { // value stores one line of the text file (terminated by newline), while key is the offset of the first character of that line relative to the beginning of the text file
StringTokenizeritr=newStringTokenizer(value.toString()); // Split into words
throws IOException, InterruptedException { // Reducer input is the Map process output; <key,values> where key is a single word, and values are the count values for that word
job.setMapperClass(TokenizerMapper.class); // setMapperClass: Set the Mapper, defaults to IdentityMapper
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);// setReducerClass: Set the Reducer, defaults to IdentityReducer
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, newPath(otherArgs[0])); // FileInputFormat.addInputPath: Set the input file path; it can be a file, a directory, or a wildcard pattern. It can be called multiple times to add multiple paths
FileOutputFormat.setOutputPath(job, newPath(otherArgs[1])); // FileOutputFormat.setOutputPath: Set the output file path; this path must not exist before the job runs
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
// setInputFormat: Set the map input format, defaults to TextInputFormat, key is LongWritable, value is Text
// setNumMapTasks: Set the number of map tasks. This setting usually doesn't take effect; the number of map tasks depends on the number of input splits the input data can be divided into
// setMapRunnerClass: Set the MapRunner. Map tasks are run by the MapRunner, which defaults to MapRunnable. Its function is to read records from input splits one by one and sequentially call the Mapper's map function
// setMapOutputKeyClass and setMapOutputValueClass: Set the key-value pair format of the Mapper's output
// setOutputKeyClass and setOutputValueClass: Set the key-value pair format of the Reducer's output
// setPartitionerClass and setNumReduceTasks: Set the Partitioner and the number of reduce tasks. The Partitioner defaults to HashPartitioner, which decides which partition a record enters based on the hash value of the key. Each partition is processed by one reduce task, so the number of partitions equals the number of reduce tasks
// setOutputFormat: Set the task's output format, which defaults to TextOutputFormat