我们可以在mapreduce代码中的mapper类的setup方法中放置一些计算任务吗

Posted 2023-03-23

技术标签:

【中文标题】我们可以在mapreduce代码中的mapper类的setup方法中放置一些计算任务吗【英文标题】：Can we put some computation task inside setup method of mapper class in mapreduce code 【发布时间】：2015-10-10 06:10:27 【问题描述】：

我在映射器类中使用了 setup() 方法。还有一个用户定义的方法 aprioriGenK() 在 mapper 类中定义并在 map() 方法中调用。

现在的问题是：据我所知，map 方法会对输入的每一行各调用一次。假设输入有 100 行，那么 map 方法就会被调用 100 次，而每次调用都会再调用一次 aprioriGenK 方法。但其实没有必要在每次调用 map 时都调用 aprioriGenK，因为 aprioriGenK 的结果对 map 方法的所有输入行都是相同的。aprioriGenK 方法非常占用 CPU，反复调用会增加计算时间。我们能否设法只调用一次 aprioriGenK，然后在每次 map 调用中复用其结果？我尝试过把 aprioriGenK 放到 setup 方法中，使它只被调用一次，但令人惊讶的是，这反而大大减慢了执行速度。

这是我的代码：

import dataStructuresV2.ItemsetTrie;

public class AprioriTrieMapper extends Mapper<Object, Text, Text, IntWritable>

    public static enum State
    
        UPDATED
    

    private final static IntWritable one = new IntWritable(1);
    private Text itemset = new Text();

    private Configuration conf;
    private StringTokenizer fitemset;   // store one line of previous output file of frequent itemsets
    private ItemsetTrie trieLk_1 = null;    // prefix tree to store candidate (k-1)-itemsets of previous pass
    private int k;                      // itemsetSize or iteration no.
//  private ItemsetTrie trieCk = null;          // prefix tree to store candidate k-itemsets

    public void setup(Context context) throws IOException, InterruptedException
    
        conf = context.getConfiguration();
        URI[] previousOutputURIs = Job.getInstance(conf).getCacheFiles();
        k = conf.getInt("k", k);
        trieLk_1 = new ItemsetTrie();

        for (URI previousOutputURI : previousOutputURIs)
        
            Path previousOutputPath = new Path(previousOutputURI.getPath());
            String previousOutputFileName = previousOutputPath.getName().toString();
            filterItemset(previousOutputFileName, trieLk_1);
        
    //  trieCk = aprioriGenK(trieLk_1, k-1);    // candidate generation from prefix tree of size k-1
    // end method setup

    //trim count from each line and store only itemset
    private void filterItemset(String fileName, ItemsetTrie trieLk_1)
    
        try 
        
          BufferedReader fis = new BufferedReader(new FileReader(fileName));
          String line = null;
        //  trieLk_1 = new ItemsetTrie();

          while ((line = fis.readLine()) != null)
          
              fitemset = new StringTokenizer(line, "\t");
              trieLk_1.insertCandidateItemset(fitemset.nextToken());
          
          fis.close();
        
        catch (IOException ioe)
        
          System.err.println("Caught exception while parsing the cached file '" + fileName + "' : " + StringUtils.stringifyException(ioe));
        
    // end method filterItemset

    public void map(Object key, Text value, Context context) throws IOException, InterruptedException 
    
        StringTokenizer items = new StringTokenizer(value.toString().toLowerCase()," \t\n\r\f,.:;?![]'"); // tokenize transaction
        LinkedList <String>itemlist = new LinkedList<String>(); // store the tokens or itemse of transaction

        LinkedList <String>listCt;      // list of subset of transaction that are candidates
    //  Map <String, Integer>mapCt;     // list of subset of transaction that are candidates with support count
        ItemsetTrie trieCk = null;          // prefix tree to store candidate k-itemsets
        StringTokenizer candidate;

        trieCk = aprioriGenK(trieLk_1, k-1);        // candidate generation from prefix tree of size k-1

        if(trieCk.numberOfCandidate() > 0)
            context.getCounter(State.UPDATED).increment(1);     // increment counter

        // optimization: if transaction size is less than candidate size then it should not be checked
        if(items.countTokens() >= k)
        
            while (items.hasMoreTokens())               // add tokens of transaction to list
                itemlist.add(items.nextToken());

            // we use either simple linkedlist listCt or map mapCt
            listCt = trieCk.candidateSupportCount1(itemlist, k);
            for(String listCtMember : listCt)   // generate (key, value) pair. work on listCt
            
                candidate = new StringTokenizer(listCtMember, "\n");
                if(candidate.hasMoreTokens())
                
                    itemset.set(candidate.nextToken()); context.write(itemset, one);
                
            
         // end if
     // end method map

    // generating candidate prefix tree of size k using prefix tree of size k-1
    public ItemsetTrie aprioriGenK(ItemsetTrie trieLk_1, int itemsetSize)   // itemsetSize of trie Lk_1
    
        ItemsetTrie candidateTree = new ItemsetTrie();      // local prefix tree store candidates k-itemsets
        trieLk_1.candidateGenK(candidateTree, itemsetSize); // new candidate prefix tree obtained
        return candidateTree;                               // return prefix tree of size k
     // end method aprioriGenK
 //end class TrieBasedSPCItemsetMapper

这是我的驱动程序类：

公共类 AprioriTrie 私有静态 Logger log = Logger.getLogger(AprioriTrie.class);

public static void main(String[] args) throws Exception

    Configuration conf = new Configuration();

//  String minsup = "1";
    String minsup = null;
    List<String> otherArgs = new ArrayList<String>();
    for (int i=0; i < args.length; ++i)
    
        if ("-minsup".equals(args[i]))
            minsup = args[++i];
        else
            otherArgs.add(args[i]);
    

    conf.set("min_sup", minsup);

    log.info("Started counting 1-itemset ....................");
    Date date; long startTime, endTime;                         // for recording start and end time of job
    date = new Date(); startTime = date.getTime();              // starting timer

    // Phase-1
    Job job = Job.getInstance(conf, "AprioriTrie: Iteration-1");
    job.setJarByClass(aprioriBasedAlgorithms.AprioriTrie.class);

    job.setMapperClass(OneItemsetMapper.class);
    job.setCombinerClass(OneItemsetCombiner.class);
    job.setReducerClass(OneItemsetReducer.class);

//  job.setOutputKeyClass(Text.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, 10000);   // set specific no. of line of records

//  Path inputPath = new Path("hdfs://hadoopmaster:9000/user/hduser/sample-transactions1/");
    Path inputPath = new Path(otherArgs.get(0));
//  Path outputPath = new Path("hdfs://hadoopmaster:9000/user/hduser/AprioriTrie/fis-1");
    Path outputPath = new Path(otherArgs.get(1)+"/fis-1");

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);            

    if(job.waitForCompletion(true))
        log.info("SUCCESSFULLY- Completed Frequent 1-itemsets Geneation.");
    else
        log.info("ERROR- Completed Frequent 1-itemsets Geneation.");

    // Phase-k >=2
    int iteration = 1; long counter;
    do
    
        Configuration conf2 = new Configuration();
        conf2.set("min_sup", minsup);
        conf2.setInt("k", iteration+1);

        log.info("Started counting "+(iteration+1)+"-itemsets ..................");
        Job job2 = Job.getInstance(conf2, "AprioriTrie: Iteration-"+(iteration+1));
        job2.setJarByClass(aprioriBasedAlgorithms.AprioriTrie.class);

        job2.setMapperClass(AprioriTrieMapper.class);
        job2.setCombinerClass(ItemsetCombiner.class);
        job2.setReducerClass(ItemsetReducer.class);

        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(IntWritable.class);

        job2.setNumReduceTasks(4); // break the output in 3 files

        job2.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.setNumLinesPerSplit(job2, 10000);

        FileSystem fs = FileSystem.get(new URI("hdfs://hadoopmaster:9000"), conf2);
    //  FileStatus[] status = fs.listStatus(new Path("hdfs://hadoopmaster:9000/user/hduser/AprioriTrie/fis-"+iteration+"/"));
        FileStatus[] status = fs.listStatus(new Path(otherArgs.get(1)+"/fis-"+iteration));
        for (int i=0;i<status.length;i++)
        
            job2.addCacheFile(status[i].getPath().toUri()); // add all files inside output fis
            //job2.addFileToClassPath(status[i].getPath());
        

    //  input is same for these job
    //  outputPath = new Path("hdfs://hadoopmaster:9000/user/hduser/AprioriTrie/fis-"+(iteration+1));
        outputPath = new Path(otherArgs.get(1)+"/fis-"+(iteration+1));

        FileInputFormat.setInputPaths(job2, inputPath);
        FileOutputFormat.setOutputPath(job2, outputPath);

        if(job2.waitForCompletion(true))
            log.info("SUCCESSFULLY- Completed Frequent "+(iteration+1)+"-itemsets Generation.");
        else
            log.info("ERROR- Completed Frequent "+(iteration+1)+"-itemsets Generation.");

        iteration++;
        counter = job2.getCounters().findCounter(AprioriTrieMapper.State.UPDATED).getValue();
     while (counter > 0);

    date = new Date(); endTime = date.getTime();                    //end timer
    log.info("Total Time (in milliseconds) = "+ (endTime-startTime));
    log.info("Total Time (in seconds) = "+ (endTime-startTime)*0.001F);

【问题讨论】：

【参考方案1】：

您可以在 setup 调用之后将该函数调用添加到映射器的 run 方法中。这将确保每个映射器只调用一次您的方法。

public class Mymapper extends Mapper<LongWritable,Text,Text,IntWritable> 

    public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException
    
               //do something

    
    public void myfunc(String parm)
    
        System.out.println("parm="+parm);
    
    public void run(Context context) throws IOException, InterruptedException 
    
        setup(context);
        myfunc("hello");
        while(context.nextKeyValue())
        
            map(context.getCurrentKey(), context.getCurrentValue(), context);

【讨论】：

我没有用过 run 方法，也不知道如何在驱动程序类中使用它。我已在修改后的问题中添加了我的驱动程序类，希望能帮我看看在驱动程序类中该如何调用。 @SudhakarSingh 您不需要在驱动程序类中添加任何内容。只需把 myfunc() 替换为您自己的函数名，并把它加到您的映射器类中，这样它就会在 setup 方法之后、开始从 InputFormat 读取记录之前被调用。【参考方案2】：

我对映射器类进行了更改，但生成的代码非常慢，而且似乎多次调用 aprioriGenK()。

这是我修改后的代码。

public class AprioriTrieMapper extends Mapper<Object, Text, Text, IntWritable>

public static enum State

    UPDATED


private final static IntWritable one = new IntWritable(1);
private Text itemset = new Text();

private Configuration conf;
private StringTokenizer fitemset;   // store one line of previous output file of frequent itemsets
private ItemsetTrie trieLk_1 = null;    // prefix tree to store candidate (k-1)-itemsets of previous pass
private int k;                      // itemsetSize or iteration no.
private ItemsetTrie trieCk = null;          // prefix tree to store candidate k-itemsets

public void setup(Context context) throws IOException, InterruptedException

    conf = context.getConfiguration();
    URI[] previousOutputURIs = Job.getInstance(conf).getCacheFiles();
    k = conf.getInt("k", k);
    trieLk_1 = new ItemsetTrie();

    for (URI previousOutputURI : previousOutputURIs)
    
        Path previousOutputPath = new Path(previousOutputURI.getPath());
        String previousOutputFileName = previousOutputPath.getName().toString();
        filterItemset(previousOutputFileName, trieLk_1);
    
//  trieCk = aprioriGenK(trieLk_1, k-1);    // candidate generation from prefix tree of size k-1
// end method setup

//trim count from each line and store only itemset
private void filterItemset(String fileName, ItemsetTrie trieLk_1)

    try 
    
      BufferedReader fis = new BufferedReader(new FileReader(fileName));
      String line = null;
    //  trieLk_1 = new ItemsetTrie();

      while ((line = fis.readLine()) != null)
      
          fitemset = new StringTokenizer(line, "\t");
          trieLk_1.insertCandidateItemset(fitemset.nextToken());
      
      fis.close();
    
    catch (IOException ioe)
    
      System.err.println("Caught exception while parsing the cached file '" + fileName + "' : " + StringUtils.stringifyException(ioe));
    
// end method filterItemset

//run method
public void run(Context context) throws IOException, InterruptedException

    setup(context);
    trieCk = aprioriGenK(trieLk_1, k-1);    // candidate generation from prefix tree of size k-1

    if(trieCk.numberOfCandidate() > 0)
        context.getCounter(State.UPDATED).increment(1);     // increment counter

    while(context.nextKeyValue())
    
        map(context.getCurrentKey(), context.getCurrentValue(), context);
    
// end method run

public void map(Object key, Text value, Context context) throws IOException, InterruptedException 

    StringTokenizer items = new StringTokenizer(value.toString().toLowerCase()," \t\n\r\f,.:;?![]'"); // tokenize transaction
    LinkedList <String>itemlist = new LinkedList<String>(); // store the tokens or itemse of transaction

    LinkedList <String>listCt;      // list of subset of transaction that are candidates
//  Map <String, Integer>mapCt;     // list of subset of transaction that are candidates with support count
//  ItemsetTrie trieCk = null;          // prefix tree to store candidate k-itemsets
    StringTokenizer candidate;

//  if(context.getCounter(State.UPDATED).getValue() == 0)
//  
//      trieCk = aprioriGenK(trieLk_1, k-1);    // candidate generation from prefix tree of size k-1

    //  if(trieCk.numberOfCandidate() > 0)
        //  context.getCounter(State.UPDATED).increment(1);     // increment counter
//  

    // optimization: if transaction size is less than candidate size then it should not be checked
    if(items.countTokens() >= k)
    
        while (items.hasMoreTokens())               // add tokens of transaction to list
            itemlist.add(items.nextToken());

        // we use either simple linkedlist listCt or map mapCt
        listCt = trieCk.candidateSupportCount1(itemlist, k);
        for(String listCtMember : listCt)   // generate (key, value) pair. work on listCt
        
            candidate = new StringTokenizer(listCtMember, "\n");
            if(candidate.hasMoreTokens())
            
                itemset.set(candidate.nextToken()); context.write(itemset, one);
            
        
     // end if
 // end method map

// generating candidate prefix tree of size k using prefix tree of size k-1
public ItemsetTrie aprioriGenK(ItemsetTrie trieLk_1, int itemsetSize)   // itemsetSize of trie Lk_1

    ItemsetTrie candidateTree = new ItemsetTrie();      // local prefix tree store candidates k-itemsets
    trieLk_1.candidateGenK(candidateTree, itemsetSize); // new candidate prefix tree obtained
    return candidateTree;                               // return prefix tree of size k
 // end method aprioriGenK
 //end class TrieBasedSPCItemsetMapper

【讨论】：

以上是关于我们可以在mapreduce代码中的mapper类的setup方法中放置一些计算任务吗的主要内容，如果未能解决你的问题，请参考以下文章