hadoop GpuDiscoverer 源码

  • 2022-10-20
  • 浏览 (202)

haddop GpuDiscoverer 代码

文件路径:/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;

import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Lists;
import org.apache.hadoop.util.Sets;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;

import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;


@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDiscoverer extends Configured {
  public static final Logger LOG = LoggerFactory.getLogger(
      GpuDiscoverer.class);
  @VisibleForTesting
  static final String DEFAULT_BINARY_NAME = "nvidia-smi";

  // When executable path not set, try to search default dirs
  // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
  // launched by nvidia-docker.
  private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
      "/usr/bin", "/bin", "/usr/local/nvidia/bin");

  private static final int MAX_REPEATED_ERROR_ALLOWED = 10;

  private NvidiaBinaryHelper nvidiaBinaryHelper;
  private String pathOfGpuBinary = null;
  private Map<String, String> environment = new HashMap<>();

  private int numOfErrorExecutionSinceLastSucceed = 0;
  private GpuDeviceInformation lastDiscoveredGpuInformation = null;

  private List<GpuDevice> gpuDevicesFromUser;

  private void validateConfOrThrowException() throws YarnException {
    if (getConf() == null) {
      throw new YarnException("Please initialize (call initialize) before use "
          + GpuDiscoverer.class.getSimpleName());
    }
  }

  private String getErrorMessageOfScriptExecution(String msg) {
    return getFailedToExecuteScriptMessage() +
        "! Exception message: " + msg;
  }

  private String getErrorMessageOfScriptExecutionThresholdReached() {
    return getFailedToExecuteScriptMessage() + " for " +
        MAX_REPEATED_ERROR_ALLOWED + " times, " +
        "skipping following executions!";
  }

  private String getFailedToExecuteScriptMessage() {
    return "Failed to execute " +
        GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE +
        " (" + pathOfGpuBinary + ")";
  }

  private String getFailedToParseErrorMessage(String msg) {
    return "Failed to parse XML output of " +
        GpuDeviceInformationParser.GPU_SCRIPT_REFERENCE
        + "( " + pathOfGpuBinary + ")" + msg;
  }

  /**
   * Get GPU device information from system.
   * This need to be called after initialize.
   *
   * Please note that this only works on *NIX platform, so external caller
   * need to make sure this.
   *
   * @return GpuDeviceInformation
   * @throws YarnException when any error happens
   */
  public synchronized GpuDeviceInformation getGpuDeviceInformation()
      throws YarnException {
    if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
      String msg = getErrorMessageOfScriptExecutionThresholdReached();
      LOG.error(msg);
      throw new YarnException(msg);
    }

    try {
      lastDiscoveredGpuInformation =
          nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
    } catch (IOException e) {
      numOfErrorExecutionSinceLastSucceed++;
      String msg = getErrorMessageOfScriptExecution(e.getMessage());
      LOG.debug(msg);
      throw new YarnException(msg, e);
    } catch (YarnException e) {
      numOfErrorExecutionSinceLastSucceed++;
      String msg = getFailedToParseErrorMessage(e.getMessage());
      LOG.debug(msg, e);
      throw e;
    }

    return lastDiscoveredGpuInformation;
  }

  boolean isAutoDiscoveryEnabled() {
    String allowedDevicesStr = getConf().get(
        YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
    return allowedDevicesStr.equals(
        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
  }

  /**
   * Get list of GPU devices usable by YARN.
   *
   * @return List of GPU devices
   */
  public synchronized List<GpuDevice> getGpusUsableByYarn()
      throws YarnException {
    validateConfOrThrowException();

    if (isAutoDiscoveryEnabled()) {
      return parseGpuDevicesFromAutoDiscoveredGpuInfo();
    } else {
      if (gpuDevicesFromUser == null) {
        gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
      }
      return gpuDevicesFromUser;
    }
  }

  private List<GpuDevice> parseGpuDevicesFromAutoDiscoveredGpuInfo()
          throws YarnException {
    if (lastDiscoveredGpuInformation == null) {
      String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
          + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
          + ", however automatically discovering "
          + "GPU information failed, please check NodeManager log for more"
          + " details, as an alternative, admin can specify "
          + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
          + " manually to enable GPU isolation.";
      LOG.error(msg);
      throw new YarnException(msg);
    }

    List<GpuDevice> gpuDevices = new ArrayList<>();
    if (lastDiscoveredGpuInformation.getGpus() != null) {
      int numberOfGpus = lastDiscoveredGpuInformation.getGpus().size();
      LOG.debug("Found {} GPU devices", numberOfGpus);
      for (int i = 0; i < numberOfGpus; i++) {
        List<PerGpuDeviceInformation> gpuInfos =
            lastDiscoveredGpuInformation.getGpus();
        gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
      }
    }
    return gpuDevices;
  }

  /**
   * @return List of GpuDevices
   * @throws YarnException when a GPU device is defined as a duplicate.
   * The first duplicate GPU device will be added to the exception message.
   */
  private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
      throws YarnException {
    String devices = getConf().get(
        YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);

    if (devices.trim().isEmpty()) {
      throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
    }
    List<GpuDevice> gpuDevices = Lists.newArrayList();
    for (String device : devices.split(",")) {
      if (device.trim().length() > 0) {
        String[] splitByColon = device.trim().split(":");
        if (splitByColon.length != 2) {
          throwIfNecessary(GpuDeviceSpecificationException
              .createWithWrongValueSpecified(device, devices), getConf());
          LOG.warn("Wrong GPU specification string {}, ignored", device);
        }

        GpuDevice gpuDevice;
        try {
          gpuDevice = parseGpuDevice(splitByColon);
        } catch (NumberFormatException e) {
          throwIfNecessary(GpuDeviceSpecificationException
              .createWithWrongValueSpecified(device, devices, e), getConf());
          LOG.warn("Cannot parse GPU device numbers: {}", device);
          continue;
        }

        if (!gpuDevices.contains(gpuDevice)) {
          gpuDevices.add(gpuDevice);
        } else {
          throwIfNecessary(GpuDeviceSpecificationException
              .createWithDuplicateValueSpecified(device, devices), getConf());
          LOG.warn("CPU device is duplicated: {}", device);
        }
      }
    }
    LOG.info("Allowed GPU devices:" + gpuDevices);

    return gpuDevices;
  }

  private GpuDevice parseGpuDevice(String[] splitByColon) {
    int index = Integer.parseInt(splitByColon[0]);
    int minorNumber = Integer.parseInt(splitByColon[1]);
    return new GpuDevice(index, minorNumber);
  }

  public synchronized void initialize(Configuration config,
      NvidiaBinaryHelper nvidiaHelper) throws YarnException {
    setConf(config);
    this.nvidiaBinaryHelper = nvidiaHelper;
    if (isAutoDiscoveryEnabled()) {
      numOfErrorExecutionSinceLastSucceed = 0;
      lookUpAutoDiscoveryBinary(config);

      // Try to discover GPU information once and print
      try {
        LOG.info("Trying to discover GPU information ...");
        GpuDeviceInformation info = getGpuDeviceInformation();
        LOG.info("Discovered GPU information: " + info.toString());
      } catch (YarnException e) {
        String msg =
                "Failed to discover GPU information from system, exception message:"
                        + e.getMessage() + " continue...";
        LOG.warn(msg);
      }
    }
  }

  private void lookUpAutoDiscoveryBinary(Configuration config)
      throws YarnException {
    String configuredBinaryPath = config.get(
        YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME);
    if (configuredBinaryPath.isEmpty()) {
      configuredBinaryPath = DEFAULT_BINARY_NAME;
    }

    File binaryPath;
    File configuredBinaryFile = new File(configuredBinaryPath);
    if (!configuredBinaryFile.exists()) {
      binaryPath = lookupBinaryInDefaultDirs();
    } else if (configuredBinaryFile.isDirectory()) {
      binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
    } else {
      binaryPath = configuredBinaryFile;
      // If path exists but file name is incorrect don't execute the file
      String fileName = binaryPath.getName();
      if (!DEFAULT_BINARY_NAME.equals(fileName)) {
        String msg = String.format("Please check the configuration value of"
             +" %s. It should point to an %s binary, which is now %s",
             YarnConfiguration.NM_GPU_PATH_TO_EXEC,
             DEFAULT_BINARY_NAME,
             fileName);
        throwIfNecessary(new YarnException(msg), config);
        LOG.warn(msg);
      }
    }

    pathOfGpuBinary = binaryPath.getAbsolutePath();
  }

  private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
      throws YarnException {
    File binaryPath = new File(configuredBinaryFile, DEFAULT_BINARY_NAME);
    if (!binaryPath.exists()) {
      throw new YarnException("Failed to find GPU discovery executable, " +
          "please double check "+ YarnConfiguration.NM_GPU_PATH_TO_EXEC +
          " setting. The setting points to a directory but " +
          "no file found in the directory with name:" + DEFAULT_BINARY_NAME);
    } else {
      LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
          + " under the directory, updated path-to-executable:"
          + binaryPath.getAbsolutePath());
    }
    return binaryPath;
  }

  private File lookupBinaryInDefaultDirs() throws YarnException {
    final File lookedUpBinary = lookupBinaryInDefaultDirsInternal();
    if (lookedUpBinary == null) {
      throw new YarnException("Failed to find GPU discovery executable, " +
          "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC +
          " setting. Also tried to find the executable " +
          "in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS);
    }
    return lookedUpBinary;
  }

  private File lookupBinaryInDefaultDirsInternal() {
    Set<String> triedBinaryPaths = Sets.newHashSet();
    for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
      File binaryPath = new File(dir, DEFAULT_BINARY_NAME);
      if (binaryPath.exists()) {
        return binaryPath;
      } else {
        triedBinaryPaths.add(binaryPath.getAbsolutePath());
      }
    }
    LOG.warn("Failed to locate GPU device discovery binary, tried paths: "
        + triedBinaryPaths + "! Please double check the value of config "
        + YarnConfiguration.NM_GPU_PATH_TO_EXEC +
        ". Using default binary: " + DEFAULT_BINARY_NAME);

    return null;
  }

  @VisibleForTesting
  Map<String, String> getEnvironmentToRunCommand() {
    return environment;
  }

  @VisibleForTesting
  String getPathOfGpuBinary() {
    return pathOfGpuBinary;
  }
}

相关信息

hadoop 源码目录

相关文章

hadoop AssignedGpuDevice 源码

hadoop GpuDevice 源码

hadoop GpuDeviceSpecificationException 源码

hadoop GpuDockerCommandPluginFactory 源码

hadoop GpuNodeResourceUpdateHandler 源码

hadoop GpuResourcePlugin 源码

hadoop NvidiaBinaryHelper 源码

hadoop NvidiaDockerV1CommandPlugin 源码

hadoop NvidiaDockerV2CommandPlugin 源码

hadoop package-info 源码

0  赞