import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/*
 * This is the skeleton for CS61c project 1, Spring 2011.
 *
 * Contact Ari Rabkin or Charles Reiss with questions and comments.
 *
 * Reminder: DO NOT SHARE CODE OR ALLOW ANOTHER STUDENT TO READ YOURS.
 * EVEN FOR DEBUGGING. THIS MEANS YOU.
 *
 */
public class Proj1 {

    /**
     * Inputs a set of (docID, document contents) pairs.
     * Outputs a set of (word, (A_i, C_i)) pairs.
     */
    public static class Map1 extends Mapper<Text, Text, Text, Text> {
        /** Regex pattern to find words (alphanumeric + _). */
        final static Pattern WORD_PATTERN = Pattern.compile("\\w+");

        private String targetWord = null;

        /*
         * Setup gets called exactly once for each mapper, before map() gets
         * called the first time. It's a good place to do configuration or
         * setup that can be shared across many calls to map().
         */
        @Override
        public void setup(Context context) {
            targetWord = context.getConfiguration().get("targetWord").toLowerCase();
        }

        @Override
        public void map(Text docID, Text docContents, Context context)
                throws IOException, InterruptedException {
            Matcher matcher = WORD_PATTERN.matcher(docContents.toString());

            // perhaps do something here, such as per-map setup

            while (matcher.find()) {
                // perhaps do something with each word of input
            }

            // perhaps do something here, such as output
        }
    }

    public static class Combine1 extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // something here?
        }
    }

    public static class Reduce1 extends Reducer<Text, Text, DoubleWritable, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // do something here?
            // Note that if you want an identity transform, you can just delete
            // this function and use the default.
            for (Text value : values) {
                // code here?
            }
            // code here?
        }
    }

    public static class Map2 extends Mapper<DoubleWritable, Text, DoubleWritable, Text> {
        // do something here?
        // Note that if you want an identity transform, you can just not
        // override map() and use the default.
    }

    public static class Reduce2 extends Reducer<DoubleWritable, Text, DoubleWritable, Text> {
        int n = 0;
        static int N_TO_OUTPUT = 100;

        /*
         * Setup gets called exactly once for each reducer, before reduce()
         * gets called the first time. It's a good place to do configuration
         * or setup that can be shared across many calls to reduce().
         */
        @Override
        protected void setup(Context c) {
            n = 0;
        }

        @Override
        public void reduce(DoubleWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                // do something with each value
            }
        }
    }
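    /*
     * Illustrative sketch, not part of the official skeleton: the Map1 javadoc
     * says each value carries an (A_i, C_i) pair inside a single Text, but the
     * skeleton does not fix an encoding. One minimal option, assuming a simple
     * comma-separated format (the "A,C" encoding and these helper names are
     * illustrative, not the assignment's), is a pair of static helpers like
     * the following. Any scheme that Combine1/Reduce1 can parse back
     * symmetrically would work just as well.
     */
    static Text encodePair(double a, double c) {
        // Pack the two doubles into one Text value, e.g. "1.0,2.0".
        return new Text(a + "," + c);
    }

    static double[] decodePair(Text pair) {
        // Split the assumed "A,C" encoding back into its two components.
        String[] parts = pair.toString().split(",", 2);
        return new double[] { Double.parseDouble(parts[0]), Double.parseDouble(parts[1]) };
    }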
    /*
     * You shouldn't need to modify this function. If you think you have a
     * good reason to, you might want to discuss it with the staff.
     *
     * For grading, it would be helpful if you clearly indicated any changes
     * you make here and why you made them.
     *
     * The skeleton supports several options.
     *
     * If you set runJob2 to false, only the first job will run, and its
     * output will be in TextFile format instead of SequenceFile format.
     * This is intended as a debugging aid.
     *
     * If you set combiner to false, neither combiner will run. This is also
     * intended as a debugging aid. Turning the combiners on and off shouldn't
     * alter your results. Since the framework makes no promises about when it
     * will invoke combiners, it's an error to assume anything about how many
     * times values will be combined.
     */
    public static void main(String[] rawArgs) throws Exception {
        GenericOptionsParser parser = new GenericOptionsParser(rawArgs);
        Configuration conf = parser.getConfiguration();
        String[] args = parser.getRemainingArgs();

        boolean runJob2 = conf.getBoolean("runJob2", true);
        boolean combiner = conf.getBoolean("combiner", false);

        if (runJob2)
            System.out.println("running both jobs");
        else
            System.out.println("for debugging, only running job 1");

        if (combiner)
            System.out.println("using combiner");
        else
            System.out.println("NOT using combiner");

        Path inputPath = new Path(args[0]);
        Path middleOut = new Path(args[1]);
        Path finalOut = new Path(args[2]);

        FileSystem hdfs = middleOut.getFileSystem(conf);
        if (hdfs.exists(middleOut)) {
            System.err.println("can't run: " + middleOut.toUri().toString() + " already exists");
            System.exit(1);
        }
        if (finalOut.getFileSystem(conf).exists(finalOut)) {
            System.err.println("can't run: " + finalOut.toUri().toString() + " already exists");
            System.exit(1);
        }

        {
            Job firstJob = new Job(conf, "wordcount+co-occur");

            firstJob.setJarByClass(Map1.class);

            firstJob.setMapOutputKeyClass(Text.class);
            firstJob.setMapOutputValueClass(Text.class);
            firstJob.setOutputKeyClass(DoubleWritable.class);
            firstJob.setOutputValueClass(Text.class);

            firstJob.setMapperClass(Map1.class);
            firstJob.setReducerClass(Reduce1.class);
            firstJob.setNumReduceTasks(16);

            if (combiner)
                firstJob.setCombinerClass(Combine1.class);

            firstJob.setInputFormatClass(SequenceFileInputFormat.class);
            if (runJob2)
                firstJob.setOutputFormatClass(SequenceFileOutputFormat.class);

            FileInputFormat.addInputPath(firstJob, inputPath);
            FileOutputFormat.setOutputPath(firstJob, middleOut);

            firstJob.waitForCompletion(true);
        }

        if (runJob2) {
            Job secondJob = new Job(conf, "sort");

            secondJob.setJarByClass(Map1.class);

            secondJob.setMapOutputKeyClass(DoubleWritable.class);
            secondJob.setMapOutputValueClass(Text.class);
            secondJob.setOutputKeyClass(DoubleWritable.class);
            secondJob.setOutputValueClass(Text.class);

            secondJob.setMapperClass(Map2.class);
            if (combiner)
                secondJob.setCombinerClass(Reduce2.class);
            secondJob.setReducerClass(Reduce2.class);

            secondJob.setInputFormatClass(SequenceFileInputFormat.class);
            secondJob.setOutputFormatClass(TextOutputFormat.class);
            secondJob.setNumReduceTasks(1);

            FileInputFormat.addInputPath(secondJob, middleOut);
            FileOutputFormat.setOutputPath(secondJob, finalOut);

            secondJob.waitForCompletion(true);
        }
    }
}
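/*
 * Example invocation (the jar and path names below are placeholders, not
 * fixed by the skeleton). runJob2, combiner, and targetWord are all read from
 * the Configuration, so they can be supplied as -D options, which
 * GenericOptionsParser strips out before the three positional path arguments:
 *
 *   hadoop jar proj1.jar Proj1 -DtargetWord=hello -Dcombiner=true \
 *       -DrunJob2=false input/ middleOut/ finalOut/
 *
 * Note that Map1.setup() calls toLowerCase() on the targetWord value, so the
 * first job will fail with a NullPointerException if targetWord is unset.
 */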