hadoop InputStriper 源码

  • 2022-10-20
  • 浏览 (198)

haddop InputStriper 代码

文件路径:/hadoop-tools/hadoop-gridmix/src/main/java/org/apache/hadoop/mapred/gridmix/InputStriper.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred.gridmix;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Given a {@link #FilePool}, obtain a set of files capable of satisfying
 * a full set of splits, then iterate over each source to fill the request.
 */
class InputStriper {
  public static final Logger LOG = LoggerFactory.getLogger(InputStriper.class);
  int idx;
  long currentStart;
  FileStatus current;
  final List<FileStatus> files = new ArrayList<FileStatus>();
  final Configuration conf = new Configuration();

  /**
   * @param inputDir Pool from which files are requested.
   * @param mapBytes Sum of all expected split requests.
   */
  InputStriper(FilePool inputDir, long mapBytes)
      throws IOException {
    final long inputBytes = inputDir.getInputFiles(mapBytes, files);
    if (mapBytes > inputBytes) {
      LOG.warn("Using " + inputBytes + "/" + mapBytes + " bytes");
    }
    if (files.isEmpty() && mapBytes > 0) {
      throw new IOException("Failed to satisfy request for " + mapBytes);
    }
    current = files.isEmpty() ? null : files.get(0);
  }

  /**
   * @param inputDir Pool used to resolve block locations.
   * @param bytes Target byte count
   * @param nLocs Number of block locations per split.
   * @return A set of files satisfying the byte count, with locations weighted
   *         to the dominating proportion of input bytes.
   */
  CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs)
      throws IOException {
    final ArrayList<Path> paths = new ArrayList<Path>();
    final ArrayList<Long> start = new ArrayList<Long>();
    final ArrayList<Long> length = new ArrayList<Long>();
    final HashMap<String,Double> sb = new HashMap<String,Double>();
    do {
      paths.add(current.getPath());
      start.add(currentStart);
      final long fromFile = Math.min(bytes, current.getLen() - currentStart);
      length.add(fromFile);
      for (BlockLocation loc :
          inputDir.locationsFor(current, currentStart, fromFile)) {
        final double tedium = loc.getLength() / (1.0 * bytes);
        for (String l : loc.getHosts()) {
          Double j = sb.get(l);
          if (null == j) {
            sb.put(l, tedium);
          } else {
            sb.put(l, j.doubleValue() + tedium);
          }
        }
      }
      currentStart += fromFile;
      bytes -= fromFile;
      // Switch to a new file if
      //  - the current file is uncompressed and completely used
      //  - the current file is compressed
      
      CompressionCodecFactory compressionCodecs = 
        new CompressionCodecFactory(conf);
      CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
      if (current.getLen() - currentStart == 0
          || codec != null) {
        current = files.get(++idx % files.size());
        currentStart = 0;
      }
    } while (bytes > 0);
    final ArrayList<Entry<String,Double>> sort =
      new ArrayList<Entry<String,Double>>(sb.entrySet());
    Collections.sort(sort, hostRank);
    final String[] hosts = new String[Math.min(nLocs, sort.size())];
    for (int i = 0; i < nLocs && i < sort.size(); ++i) {
      hosts[i] = sort.get(i).getKey();
    }
    return new CombineFileSplit(paths.toArray(new Path[0]),
        toLongArray(start), toLongArray(length), hosts);
  }

  private long[] toLongArray(final ArrayList<Long> sigh) {
    final long[] ret = new long[sigh.size()];
    for (int i = 0; i < ret.length; ++i) {
      ret[i] = sigh.get(i);
    }
    return ret;
  }

  static final Comparator<Entry<String,Double>> hostRank =
    new Comparator<Entry<String,Double>>() {
      public int compare(Entry<String,Double> a, Entry<String,Double> b) {
        return Double.compare(a.getValue(), b.getValue());
      }
    };
}

相关信息

hadoop 源码目录

相关文章

hadoop AvgRecordFactory 源码

hadoop ClusterSummarizer 源码

hadoop CompressionEmulationUtil 源码

hadoop DistributedCacheEmulator 源码

hadoop EchoUserResolver 源码

hadoop ExecutionSummarizer 源码

hadoop FilePool 源码

hadoop FileQueue 源码

hadoop GenerateData 源码

hadoop GenerateDistCacheData 源码

0  赞