Sample output (word → file:count pairs): universe in3.txt:1; why in3.txt:1; you in1.txt:2; ? in1.txt:1;
Code:
package train;
import java.io.IOException; import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; /**
 * Inverted index: builds, for every word in the input files, the list of
 * files it occurs in together with its per-file occurrence count.
 * @author hadoop
 */
public class InvertedIndex {
//输出值:key为单词+文件地址 value为频数,均指定1 public static class Map extends Mapper
private Text valueStr = new Text(); private FileSplit fileSplit;
public void map(Object key,Text value,Context context) throws IOException, InterruptedException{
//获取输入文件信息
fileSplit = (FileSplit)context.getInputSplit(); //按空格切割
StringTokenizer st = new StringTokenizer(value.toString().trim()); while(st.hasMoreTokens()){
String filePath = fileSplit.getPath().getName().toString(); keyStr.set(st.nextToken()+\ valueStr.set(\
context.write(keyStr,valueStr); } } }
//合并频数
//输出:key为单词 value为文件地址+频数
public static class Combine extends Reducer
public void reduce(Text key,Iterable
for(Text value:values){
sum += Integer.parseInt(value.toString()); }
//拆分原有key,将单词作为新key,文件地址+频数 作为value int index = key.toString().indexOf(\
String word = key.toString().substring(0,index);
String filePath = key.toString().substring(index+1,key.toString().length()); key.set(word);
newValue.set(filePath+\ context.write(key,newValue); } }
//将每个单词对应的多个文件及频数整合到一行
public static class Reduce extends Reducer
public void reduce(Text key,Iterable
String files = \
for(Text value:values){ files += value+\
}
newValue.set(files);
context.write(key,newValue); } }
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuranc630.comtion conf = new Configuration(); conf.set(\ args = new String[]{\adoop/output/invertedIndex_out\ //检查运行命令
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs(); if(otherArgs.length != 2){
System.err.println(\ System.exit(2); }
//配置作业名
Job job = new Job(conf,\ //配置作业各个类
job.setJarByClass(InvertedIndex.class); job.setMapperClass(Map.class); job.setCombinerClass(Combine.class); job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
(End of example. Hosting-site navigation and advertising boilerplate removed.)