hadoop StandbyCheckpointer 源码

  • 2022-10-20
  • 浏览 (51)

hadoop StandbyCheckpointer 代码


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode.ha;

import static org.apache.hadoop.util.Time.monotonicNow;

import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.server.namenode.CheckpointConf;
import org.apache.hadoop.hdfs.server.namenode.CheckpointFaultInjector;
import org.apache.hadoop.hdfs.server.namenode.FSImage;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.SaveNamespaceCancelledException;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.hdfs.util.Canceler;
import org.apache.hadoop.io.MultipleIOException;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Lists;

import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.util.Preconditions;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Thread which runs inside the NN when it's in Standby state,
 * periodically waking up to take a checkpoint of the namespace.
 * When it takes a checkpoint, it saves it to its local
 * storage and then uploads it to the remote NameNode.
 */
public class StandbyCheckpointer {
  private static final Logger LOG =
  private static final long PREVENT_AFTER_CANCEL_MS = 2*60*1000L;
  private final CheckpointConf checkpointConf;
  private final Configuration conf;
  private final FSNamesystem namesystem;
  private long lastCheckpointTime;
  private final CheckpointerThread thread;
  private final ThreadFactory uploadThreadFactory;
  private List<URL> activeNNAddresses;
  private URL myNNAddress;

  private final Object cancelLock = new Object();
  private Canceler canceler;

  // Keep track of how many checkpoints were canceled.
  // This is for use in tests.
  private static int canceledCount = 0;

  // A map from NN url to the most recent image upload time.
  private final HashMap<String, CheckpointReceiverEntry> checkpointReceivers;
  /**
   * Builds a checkpointer bound to the given namesystem.
   * Stores the arguments, derives a {@link CheckpointConf} from the
   * configuration and creates (but does not start) the checkpointer thread.
   *
   * @param conf configuration used for the checkpoint settings; also kept
   *     for the later image uploads
   * @param ns the namesystem whose image will be checkpointed
   * @throws IOException declared by the signature; the throwing call is not
   *     visible in this listing
   */
  public StandbyCheckpointer(Configuration conf, FSNamesystem ns)
      throws IOException {
    this.namesystem = ns;
    this.conf = conf;
    this.checkpointConf = new CheckpointConf(conf); 
    this.thread = new CheckpointerThread();
    this.uploadThreadFactory = new ThreadFactoryBuilder().setDaemon(true)
    // NOTE(review): the rest of the builder chain above is missing from this
    // listing, and so is whatever populates activeNNAddresses before the
    // loop below (presumably a call to setNameNodeAddresses(conf)) --
    // confirm against the upstream file.
    this.checkpointReceivers = new HashMap<>();
    for (URL address : activeNNAddresses) {
      // NOTE(review): the line that opens this call is missing; it looks
      // like every address gets a fresh entry in checkpointReceivers.
          new CheckpointReceiverEntry());

  private static final class CheckpointReceiverEntry {
    private long lastUploadTime;
    private boolean isPrimary;

    CheckpointReceiverEntry() {
      this.lastUploadTime = 0L;
      this.isPrimary = true;

    void setLastUploadTime(long lastUploadTime) {
      this.lastUploadTime = lastUploadTime;

    void setIsPrimary(boolean isPrimaryFor) {
      this.isPrimary = isPrimaryFor;

    long getLastUploadTime() {
      return lastUploadTime;

    boolean isPrimary() {
      return isPrimary;

  /**
   * Determine the address of the NN we are checkpointing
   * as well as our own HTTP address from the configuration.
   * @throws IOException if an HTTP address cannot be looked up
   */
  private void setNameNodeAddresses(Configuration conf) throws IOException {
    // Look up our own address.
    myNNAddress = getHttpAddress(conf);

    // Look up the active node's address
    List<Configuration> confForActive = HAUtil.getConfForOtherNodes(conf);
    activeNNAddresses = new ArrayList<URL>(confForActive.size());
    for (Configuration activeConf : confForActive) {
      URL activeNNAddress = getHttpAddress(activeConf);

      // sanity check each possible active NN
          "Bad address for active NN: %s", activeNNAddress);


    // Sanity-check.
    Preconditions.checkArgument(checkAddress(myNNAddress), "Bad address for standby NN: %s",
  private URL getHttpAddress(Configuration conf) throws IOException {
    final String scheme = DFSUtil.getHttpClientScheme(conf);
    String defaultHost = NameNode.getServiceAddress(conf, true).getHostName();
    URI addr = DFSUtil.getInfoServerWithDefaultHost(defaultHost, conf, scheme);
    return addr.toURL();
  /**
   * Ensure that the given address is valid and has a port
   * specified.
   */
  private static boolean checkAddress(URL addr) {
    return addr.getPort() != 0;

  /**
   * Starts the standby checkpointer. The visible part logs the addresses
   * checkpoints are sent to and the address they are served from.
   */
  public void start() {
    LOG.info("Starting standby checkpoint thread...\n" +
        "Checkpointing active NN to possible NNs: {}\n" +
        "Serving checkpoints at {}", activeNNAddresses, myNNAddress);
    // NOTE(review): the remainder of this method (presumably starting the
    // checkpointer thread) and its closing brace are missing from this
    // listing -- confirm against the upstream file.
  /**
   * Stops the checkpointer. It first cancels any in-progress checkpoint and
   * blocks new ones via {@link #cancelAndPreventCheckpoints(String)}.
   *
   * @throws IOException if the caller is interrupted (the
   *     InterruptedException is wrapped), or if the cancel step fails
   */
  public void stop() throws IOException {
    cancelAndPreventCheckpoints("Stopping checkpointer");
    // NOTE(review): statements are missing here and inside the try block
    // below (presumably clearing the run flag, interrupting the thread and
    // joining it) -- confirm against the upstream file.
    try {
    } catch (InterruptedException e) {
      // NOTE(review): the message names the edit log tailer although this is
      // the checkpointer, the exception is not passed to the logger, and the
      // interrupt status is not restored before rethrowing.
      LOG.warn("Edit log tailer thread exited with an exception");
      throw new IOException(e);

  // NOTE(review): only the signature survived in this listing; the body and
  // the closing brace are missing. Judging by the name it nudges the
  // checkpointer thread to produce a rollback checkpoint -- confirm against
  // the upstream file.
  public void triggerRollbackCheckpoint() {

  /**
   * Takes one checkpoint and ships it to the other NameNodes.
   *
   * Phase 1 saves the namespace locally (as a rollback image when a rolling
   * upgrade is in progress and no rollback image exists yet, otherwise as a
   * regular image) and optionally writes the legacy OIV image.
   * Phase 2 submits one upload task per receiving NameNode for which we are
   * the primary checkpointer or whose quiet period has elapsed, then waits
   * for the results.
   *
   * Must be called with a canceler already installed (asserted on entry).
   *
   * @throws InterruptedException if interrupted while waiting for an upload;
   *     pending uploads are dealt with before it is rethrown
   * @throws IOException if saving fails, or wrapping every upload failure
   *     collected across the receiving NameNodes
   */
  private void doCheckpoint() throws InterruptedException, IOException {
    assert canceler != null;
    final long txid;
    final NameNodeFile imageType;
    // Acquire cpLock to make sure no one is modifying the name system.
    // It does not need the full namesystem write lock, since the only thing
    // that modifies namesystem on standby node is edit log replaying.
    // NOTE(review): the lock acquisition itself is missing from this listing.
    try {
      assert namesystem.getEditLog().isOpenForRead() :
        "Standby Checkpointer should only attempt a checkpoint when " +
        "NN is in standby mode, but the edit logs are in an unexpected state";

      FSImage img = namesystem.getFSImage();

      long prevCheckpointTxId = img.getStorage().getMostRecentCheckpointTxId();
      long thisCheckpointTxId = img.getCorrectLastAppliedOrWrittenTxId();
      assert thisCheckpointTxId >= prevCheckpointTxId;
      if (thisCheckpointTxId == prevCheckpointTxId) {
        LOG.info("A checkpoint was triggered but the Standby Node has not " +
            "received any transactions since the last checkpoint at txid {}. " +
            "Skipping...", thisCheckpointTxId);
        // NOTE(review): the early exit that follows this log line and the
        // closing brace are missing from this listing.

      if (namesystem.isRollingUpgrade()
          && !namesystem.getFSImage().hasRollbackFSImage()) {
        // if we will do rolling upgrade but have not created the rollback image
        // yet, name this checkpoint as fsimage_rollback
        imageType = NameNodeFile.IMAGE_ROLLBACK;
      } else {
        imageType = NameNodeFile.IMAGE;
      // The canceler lets a concurrent cancel request abort the save.
      img.saveNamespace(namesystem, imageType, canceler);
      txid = img.getStorage().getMostRecentCheckpointTxId();
      assert txid == thisCheckpointTxId : "expected to save checkpoint at txid=" +
          thisCheckpointTxId + " but instead saved at txid=" + txid;

      // Save the legacy OIV image, if the output dir is defined.
      String outputDir = checkpointConf.getLegacyOivImageDir();
      if (outputDir != null && !outputDir.isEmpty()) {
        try {
          img.saveLegacyOIVImage(namesystem, outputDir, canceler);
        } catch (IOException ioe) {
          // Best effort: a failed legacy image must not fail the checkpoint.
          LOG.warn("Exception encountered while saving legacy OIV image; "
                  + "continuing with other checkpointing steps", ioe);
    } finally {
      // NOTE(review): the body of this finally block (presumably releasing
      // the lock taken above) is missing from this listing.

    // Upload the saved checkpoint back to the active
    // Do this in a separate thread to avoid blocking transition to active, but don't allow more
    // than the expected number of tasks to run or queue up
    // See HDFS-4816
    // NOTE(review): the last constructor argument(s) of the executor are
    // missing from this listing.
    ExecutorService executor = new ThreadPoolExecutor(0, activeNNAddresses.size(), 100,
        TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(activeNNAddresses.size()),
    // for right now, just match the upload to the nn address by convention. There is no need to
    // directly tie them together by adding a pair class.
    HashMap<String, Future<TransferFsImage.TransferResult>> uploads =
        new HashMap<>();
    for (final URL activeNNAddress : activeNNAddresses) {
      // Upload image if at least 1 of 2 following conditions met:
      // 1. has been quiet for long enough, try to contact the node.
      // 2. this standby IS the primary checkpointer of target NN.
      String addressString = activeNNAddress.toString();
      assert checkpointReceivers.containsKey(addressString);
      // NOTE(review): the right-hand sides of the next two declarations are
      // truncated (a lookup in checkpointReceivers and a conversion of the
      // elapsed time to seconds, by the look of it).
      CheckpointReceiverEntry receiverEntry =
      long secsSinceLastUpload =
              monotonicNow() - receiverEntry.getLastUploadTime());
      boolean shouldUpload = receiverEntry.isPrimary() ||
          secsSinceLastUpload >= checkpointConf.getQuietPeriod();
      if (shouldUpload) {
        Future<TransferFsImage.TransferResult> upload =
            executor.submit(new Callable<TransferFsImage.TransferResult>() {
              public TransferFsImage.TransferResult call()
                  throws IOException, InterruptedException {
                // NOTE(review): the remaining arguments of this call and the
                // closing of the anonymous class are missing here.
                return TransferFsImage.uploadImageFromStorage(activeNNAddress,
                    conf, namesystem.getFSImage().getStorage(), imageType, txid,
        uploads.put(addressString, upload);
    // Collect the results; the first interrupt wins, I/O failures accumulate.
    InterruptedException ie = null;
    List<IOException> ioes = Lists.newArrayList();
    for (Map.Entry<String, Future<TransferFsImage.TransferResult>> entry :
        uploads.entrySet()) {
      String url = entry.getKey();
      Future<TransferFsImage.TransferResult> upload = entry.getValue();
      try {
        // TODO should there be some smarts here about retries nodes that
        //  are not the active NN?
        CheckpointReceiverEntry receiverEntry = checkpointReceivers.get(url);
        TransferFsImage.TransferResult uploadResult = upload.get();
        if (uploadResult == TransferFsImage.TransferResult.SUCCESS) {
          // NOTE(review): the success branch body (presumably updating
          // receiverEntry) is missing from this listing.
        } else {
          // Getting here means image upload is explicitly rejected
          // by the other node. This could happen if:
          // 1. the other is also a standby, or
          // 2. the other is active, but already accepted another
          // newer image, or
          // 3. the other is active but has a recent enough image.
          // All these are valid cases, just log for information.
          LOG.info("Image upload rejected by the other NameNode: {}",
      } catch (ExecutionException e) {
        // Even if exception happens, still proceeds to next NN url.
        // so that fail to upload to previous NN does not cause the
        // remaining NN not getting the fsImage.
        ioes.add(new IOException("Exception during image upload", e));
      } catch (InterruptedException e) {
        ie = e;
    // cleaner than copying code for multiple catch statements and better than catching all
    // exceptions, so we just handle the ones we expect.
    if (ie != null) {

      // cancel the rest of the tasks, and close the pool
      for (Map.Entry<String, Future<TransferFsImage.TransferResult>> entry :
          uploads.entrySet()) {
        Future<TransferFsImage.TransferResult> upload = entry.getValue();
        // The background thread may be blocked waiting in the throttler, so
        // interrupt it.
        // NOTE(review): the cancel call itself is missing from this listing.

      // shutdown so we interrupt anything running and don't start anything new
      // NOTE(review): the shutdown call is missing from this listing.
      // this is a good bit longer than the thread timeout, just to make sure all the threads
      // that are not doing any work also stop
      executor.awaitTermination(500, TimeUnit.MILLISECONDS);

      // re-throw the exception we got, since one of these two must be non-null
      throw ie;

    if (!ioes.isEmpty()) {
      throw MultipleIOException.createIOException(ioes);
  /**
   * Cancel any checkpoint that's currently being made,
   * and prevent any new checkpoints from starting for the next
   * minute or so.
   */
  public void cancelAndPreventCheckpoints(String msg) throws ServiceFailedException {
    // cancelLock is the same lock the checkpointer thread holds while it
    // checks the prevention deadline and installs or clears the canceler.
    synchronized (cancelLock) {
      // The checkpointer thread takes this lock and checks if checkpointing is
      // postponed.
      // NOTE(review): the statement that actually postpones checkpoints is
      // missing from this listing (presumably a preventCheckpointsFor call
      // on the thread) -- confirm against the upstream file.

      // Before beginning a checkpoint, the checkpointer thread
      // takes this lock, and creates a canceler object.
      // If the canceler is non-null, then a checkpoint is in
      // progress and we need to cancel it. If it's null, then
      // the operation has not started, meaning that the above
      // time-based prevention will take effect.
      if (canceler != null) {
      // NOTE(review): the body of this branch (where msg would be used to
      // cancel the running checkpoint) and the closing braces are missing.
  static int getCanceledCount() {
    return canceledCount;

  private long countUncheckpointedTxns() {
    FSImage img = namesystem.getFSImage();
    return img.getCorrectLastAppliedOrWrittenTxId() -

  /**
   * Background thread that periodically decides whether a checkpoint is due
   * and, if so, performs it.
   */
  private class CheckpointerThread extends Thread {
    // Loop condition of doWork(); written through setShouldRun().
    private volatile boolean shouldRun = true;
    // monotonicNow()-based deadline; doWork() skips checkpoints before it.
    private volatile long preventCheckpointsUntil = 0;

    /** Creates the thread under a fixed, recognizable name. */
    private CheckpointerThread() {
      super("Standby State Checkpointer");
    }
    private void setShouldRun(boolean shouldRun) {
      this.shouldRun = shouldRun;

    // Thread entry point.
    // NOTE(review): this listing lost the call that wraps the
    // PrivilegedAction below and the statement(s) inside the action
    // (presumably a call to doWork()) -- confirm against the upstream file.
    public void run() {
      // We have to make sure we're logged in as far as JAAS
      // is concerned, in order to use kerberized SSL properly.
          new PrivilegedAction<Object>() {
          public Object run() {
            // The action has no result to report.
            return null;

    /**
     * Prevent checkpoints from occurring for some time period
     * in the future. This is used when preparing to enter active
     * mode. We need to not only cancel any concurrent checkpoint,
     * but also prevent any checkpoints from racing to start just
     * after the cancel call.
     *
     * @param delayMs the number of MS for which checkpoints will be
     * prevented
     */
    private void preventCheckpointsFor(long delayMs) {
      preventCheckpointsUntil = monotonicNow() + delayMs;

    /**
     * Main loop of the checkpointer thread; runs until shouldRun is cleared.
     *
     * Each iteration decides whether a checkpoint is due: always when a
     * rollback image is needed, otherwise when the number of uncheckpointed
     * transactions or the time since the last checkpoint reaches its
     * configured threshold. If one is due and checkpoints are not currently
     * prevented, a Canceler is installed under cancelLock for the duration
     * of the checkpoint and cleared again in the finally block.
     */
    private void doWork() {
      // Configured check period scaled by 1000.
      // NOTE(review): presumably seconds to milliseconds for the sleep that
      // is missing below -- confirm.
      final long checkPeriod = 1000 * checkpointConf.getCheckPeriod();
      // Reset checkpoint time so that we don't always checkpoint
      // on startup.
      lastCheckpointTime = monotonicNow();
      while (shouldRun) {
        boolean needRollbackCheckpoint = namesystem.isNeedRollbackFsImage();
        if (!needRollbackCheckpoint) {
          // NOTE(review): the bodies of the try, catch and if below are
          // missing from this listing (presumably a sleep of checkPeriod
          // and an exit from the loop once shouldRun is cleared).
          try {
          } catch (InterruptedException ie) {
          if (!shouldRun) {
        try {
          // We may have lost our ticket since last checkpoint, log in again, just in case
          // NOTE(review): the re-login call inside this if is missing.
          if (UserGroupInformation.isSecurityEnabled()) {
          final long now = monotonicNow();
          final long uncheckpointed = countUncheckpointedTxns();
          final long secsSinceLast = (now - lastCheckpointTime) / 1000;

          // if we need a rollback checkpoint, always attempt to checkpoint
          boolean needCheckpoint = needRollbackCheckpoint;

          if (needCheckpoint) {
            LOG.info("Triggering a rollback fsimage for rolling upgrade.");
          } else if (uncheckpointed >= checkpointConf.getTxnCount()) {
            LOG.info("Triggering checkpoint because there have been {} txns " +
                "since the last checkpoint, " +
                "which exceeds the configured threshold {}",
                uncheckpointed, checkpointConf.getTxnCount());
            needCheckpoint = true;
          } else if (secsSinceLast >= checkpointConf.getPeriod()) {
            LOG.info("Triggering checkpoint because it has been {} seconds " +
                "since the last checkpoint, which exceeds the configured " +
                "interval {}", secsSinceLast, checkpointConf.getPeriod());
            needCheckpoint = true;

          if (needCheckpoint) {
            // Check the deadline and install the canceler under the same
            // lock that cancelAndPreventCheckpoints() takes.
            synchronized (cancelLock) {
              if (now < preventCheckpointsUntil) {
                LOG.info("But skipping this checkpoint since we are about to failover!");
                // NOTE(review): the statement that skips this iteration is
                // missing from this listing.
              assert canceler == null;
              canceler = new Canceler();

            // on all nodes, we build the checkpoint. However, we only ship the checkpoint if have a
            // rollback request, are the checkpointer, are outside the quiet period.
            // NOTE(review): the doCheckpoint() call is missing from this listing.

            // reset needRollbackCheckpoint to false only when we finish a ckpt
            // for rollback image
            if (needRollbackCheckpoint
                && namesystem.getFSImage().hasRollbackFSImage()) {
            // The timestamp taken before the checkpoint started is recorded.
            lastCheckpointTime = now;
            LOG.info("Checkpoint finished successfully.");
        } catch (SaveNamespaceCancelledException ce) {
          LOG.info("Checkpoint was cancelled: {}", ce.getMessage());
        } catch (InterruptedException ie) {
          LOG.info("Interrupted during checkpointing", ie);
          // Probably requested shutdown.
        } catch (Throwable t) {
          // Any other failure is logged and the loop carries on.
          LOG.error("Exception in doCheckpoint", t);
        } finally {
          // Always clear the canceler so the next iteration starts clean.
          synchronized (cancelLock) {
            canceler = null;

  List<URL> getActiveNNAddresses() {
    return activeNNAddresses;


hadoop 源码目录


hadoop ActiveState 源码

hadoop BootstrapStandby 源码

hadoop EditLogTailer 源码

hadoop HAContext 源码

hadoop HAState 源码

hadoop NameNodeHAProxyFactory 源码

hadoop RemoteNameNodeInfo 源码

hadoop StandbyState 源码

0  赞