hadoop Separator 源码

  • 2022-10-20
  • 浏览 (219)

haddop Separator 代码

文件路径:/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-common/src/main/java/org/apache/hadoop/yarn/server/timelineservice/storage/common/Separator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.yarn.server.timelineservice.storage.common;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hbase.util.Bytes;

/**
 * Used to separate row qualifiers, column qualifiers and compound fields.
 */
public enum Separator {

  /**
   * separator in key or column qualifier fields.
   */
  QUALIFIERS("!", "%0$"),

  /**
   * separator in values, and/or compound key/column qualifier fields.
   */
  VALUES("=", "%1$"),

  /**
   * separator in values, often used to avoid having these in qualifiers and
   * names. Note that if we use HTML form encoding through URLEncoder, we end up
   * getting a + for a space, which may already occur in strings, so we don't
   * want that.
   */
  SPACE(" ", "%2$"),

  /**
   * separator in values, often used to avoid having these in qualifiers and
   * names.
   */
  TAB("\t", "%3$");

  // a reserved character that starts each of the encoded values and is encoded
  // first in order to escape naturally occurring instances of encoded values
  // although it can be expressed as an enum instance, we define them as private
  // variables to hide it from callers
  private static final String PERCENT = "%";
  private static final String PERCENT_ENCODED = "%9$";

  private static final Pattern PERCENT_PATTERN =
      Pattern.compile(PERCENT, Pattern.LITERAL);
  private static final String PERCENT_REPLACEMENT =
      Matcher.quoteReplacement(PERCENT);

  private static final Pattern PERCENT_ENCODED_PATTERN =
      Pattern.compile(PERCENT_ENCODED, Pattern.LITERAL);
  private static final String PERCENT_ENCODED_REPLACEMENT =
      Matcher.quoteReplacement(PERCENT_ENCODED);

  /**
   * The string value of this separator.
   */
  private final String value;

  /**
   * The bye representation of value.
   */
  private final byte[] bytes;

  // pre-compiled patterns and quoted replacements for optimization
  private final Pattern valuePattern;
  private final String valueReplacement;

  private final Pattern encodedValuePattern;
  private final String encodedValueReplacement;

  /**
   * Indicator for variable size of an individual segment in a split. The
   * segment ends wherever separator is encountered.
   * Typically used for string.
   * Also used to indicate that there is no fixed number of splits which need to
   * be returned. If split limit is specified as this, all possible splits are
   * returned.
   */
  public static final int VARIABLE_SIZE = 0;


  /** empty string. */
  public static final String EMPTY_STRING = "";

  /** empty bytes. */
  public static final byte[] EMPTY_BYTES = new byte[0];

  /**
   * @param value of the separator to use. Cannot be null or empty string.
   * @param encodedValue choose something that isn't likely to occur in the data
   *          itself. Cannot be null or empty string.
   */
  private Separator(String value, String encodedValue) {
    this.value = value;

    // validation
    if (value == null || value.length() == 0 || encodedValue == null
        || encodedValue.length() == 0) {
      throw new IllegalArgumentException(
          "Cannot create separator from null or empty string.");
    }

    this.bytes = Bytes.toBytes(value);
    this.valuePattern = Pattern.compile(value, Pattern.LITERAL);
    this.valueReplacement = Matcher.quoteReplacement(value);

    this.encodedValuePattern = Pattern.compile(encodedValue, Pattern.LITERAL);
    this.encodedValueReplacement = Matcher.quoteReplacement(encodedValue);
  }

  /**
   * @return the original value of the separator
   */
  public String getValue() {
    return value;
  }

  /**
   * Used to make token safe to be used with this separator without collisions.
   * It <em>must</em> be paired with {@link #decode(String)} for it to be
   * decoded correctly.
   * <p>
   * If you need to encode a given string for multiple separators,
   * {@link #encode(String, Separator...)} should be used over successive
   * invocations of this method. It will result in a more compact version of the
   * encoded value.
   *
   * @param token Token to be encoded.
   * @return the token with any occurrences of this separator URLEncoded.
   */
  public String encode(String token) {
    if (token == null || token.length() == 0) {
      // Nothing to replace
      return token;
    }
    // first encode the percent to escape naturally occurring encoded values
    String escaped = encodePercent(token);
    return encodeSingle(escaped, this);
  }

  private static String replace(String token, Pattern pattern,
      String replacement) {
    return pattern.matcher(token).replaceAll(replacement);
  }

  private static String encodeSingle(String token, Separator separator) {
    return replace(token, separator.valuePattern,
        separator.encodedValueReplacement);
  }

  private static String encodePercent(String token) {
    return replace(token, PERCENT_PATTERN, PERCENT_ENCODED_REPLACEMENT);
  }

  /**
   * Decode the token encoded using {@link #encode(String)}. It <em>must</em> be
   * used for the result encoded with {@link #encode(String)} to be able to
   * recover the original.
   *
   * @param token Token to be decoded.
   * @return the token with any occurrences of the encoded separator replaced by
   *         the separator itself.
   */
  public String decode(String token) {
    if (token == null || token.length() == 0) {
      // Nothing to replace
      return token;
    }
    String escaped = decodeSingle(token, this);
    // decode percent to de-escape
    return decodePercent(escaped);
  }

  private static String decodeSingle(String token, Separator separator) {
    return replace(token, separator.encodedValuePattern,
        separator.valueReplacement);
  }

  private static String decodePercent(String token) {
    return replace(token, PERCENT_ENCODED_PATTERN, PERCENT_REPLACEMENT);
  }

  /**
   * Encode the given separators in the token with their encoding equivalents.
   * It <em>must</em> be paired with {@link #decode(byte[], Separator...)} or
   * {@link #decode(String, Separator...)} with the same separators for it to be
   * decoded correctly.
   * <p>
   * If you need to encode a given string for multiple separators, this form of
   * encoding should be used over successive invocations of
   * {@link #encode(String)}. It will result in a more compact version of the
   * encoded value.
   *
   * @param token containing possible separators that need to be encoded.
   * @param separators to be encoded in the token with their URLEncoding
   *          equivalent.
   * @return non-null byte representation of the token with occurrences of the
   *         separators encoded.
   */
  public static byte[] encode(String token, Separator... separators) {
    if (token == null || token.length() == 0) {
      return EMPTY_BYTES;
    }
    String result = token;
    // first encode the percent to escape naturally occurring encoded values
    result = encodePercent(token);
    for (Separator separator : separators) {
      if (separator != null) {
        result = encodeSingle(result, separator);
      }
    }
    return Bytes.toBytes(result);
  }

  /**
   * Decode the given separators in the token with their decoding equivalents.
   * It <em>must</em> be used for the result encoded with
   * {@link #encode(String, Separator...)} with the same separators to be able
   * to recover the original.
   *
   * @param token containing possible separators that need to be encoded.
   * @param separators to be encoded in the token with their URLEncoding
   *          equivalent.
   * @return String representation of the token with occurrences of the URL
   *         encoded separators decoded.
   */
  public static String decode(byte[] token, Separator... separators) {
    if (token == null) {
      return null;
    }
    return decode(Bytes.toString(token), separators);
  }

  /**
   * Decode the given separators in the token with their decoding equivalents.
   * It <em>must</em> be used for the result encoded with
   * {@link #encode(String, Separator...)} with the same separators to be able
   * to recover the original.
   *
   * @param token containing possible separators that need to be encoded.
   * @param separators to be encoded in the token with their URLEncoding
   *          equivalent.
   * @return String representation of the token with occurrences of the URL
   *         encoded separators decoded.
   */
  public static String decode(String token, Separator... separators) {
    if (token == null) {
      return null;
    }
    String result = token;
    for (Separator separator : separators) {
      if (separator != null) {
        result = decodeSingle(result, separator);
      }
    }
    // decode percent to de-escape
    return decodePercent(result);
  }

  /**
   * Returns a single byte array containing all of the individual arrays
   * components separated by this separator.
   *
   * @param components Byte array components to be joined together.
   * @return byte array after joining the components
   */
  public byte[] join(byte[]... components) {
    if (components == null || components.length == 0) {
      return EMPTY_BYTES;
    }

    int finalSize = 0;
    finalSize = this.value.length() * (components.length - 1);
    for (byte[] comp : components) {
      if (comp != null) {
        finalSize += comp.length;
      }
    }

    byte[] buf = new byte[finalSize];
    int offset = 0;
    for (int i = 0; i < components.length; i++) {
      if (components[i] != null) {
        System.arraycopy(components[i], 0, buf, offset, components[i].length);
        offset += components[i].length;
      }
      if (i < (components.length - 1)) {
        System.arraycopy(this.bytes, 0, buf, offset, this.value.length());
        offset += this.value.length();
      }
    }
    return buf;
  }

  /**
   * Concatenates items (as String), using this separator.
   *
   * @param items Items join, {@code toString()} will be called in each item.
   *          Any occurrence of the separator in the individual strings will be
   *          first encoded. Cannot be null.
   * @return non-null joined result. Note that when separator is {@literal null}
   *         the result is simply all items concatenated and the process is not
   *         reversible through {@link #splitEncoded(String)}
   */
  public String joinEncoded(String... items) {
    if (items == null || items.length == 0) {
      return "";
    }

    StringBuilder sb = new StringBuilder(encode(items[0].toString()));
    // Start at 1, we've already grabbed the first value at index 0
    for (int i = 1; i < items.length; i++) {
      sb.append(this.value)
          .append(encode(items[i].toString()));
    }

    return sb.toString();
  }

  /**
   * Concatenates items (as String), using this separator.
   *
   * @param items Items join, {@code toString()} will be called in each item.
   *          Any occurrence of the separator in the individual strings will be
   *          first encoded. Cannot be null.
   * @return non-null joined result. Note that when separator is {@literal null}
   *         the result is simply all items concatenated and the process is not
   *         reversible through {@link #splitEncoded(String)}
   */
  public String joinEncoded(Iterable<?> items) {
    if (items == null) {
      return "";
    }
    Iterator<?> i = items.iterator();
    if (!i.hasNext()) {
      return "";
    }

    StringBuilder sb = new StringBuilder(encode(i.next().toString()));
    while (i.hasNext()) {
      sb.append(this.value);
      sb.append(encode(i.next().toString()));
    }

    return sb.toString();
  }

  /**
   * @param compoundValue containing individual values separated by this
   *          separator, which have that separator encoded.
   * @return non-null set of values from the compoundValue with the separator
   *         decoded.
   */
  public Collection<String> splitEncoded(String compoundValue) {
    List<String> result = new ArrayList<String>();
    if (compoundValue != null) {
      for (String val : valuePattern.split(compoundValue)) {
        result.add(decode(val));
      }
    }
    return result;
  }

  /**
   * Splits the source array into multiple array segments using this separator,
   * up to a maximum of count items. This will naturally produce copied byte
   * arrays for each of the split segments.
   *
   * @param source to be split
   * @param limit on how many segments are supposed to be returned. A
   *          non-positive value indicates no limit on number of segments.
   * @return source split by this separator.
   */
  public byte[][] split(byte[] source, int limit) {
    return split(source, this.bytes, limit);
  }

  /**
   * Splits the source array into multiple array segments using this separator.
   * The sizes indicate the sizes of the relative components/segments.
   * In case one of the segments contains this separator before the specified
   * size is reached, the separator will be considered part of that segment and
   * we will continue till size is reached.
   * Variable length strings cannot contain this separator and are indiced with
   * a size of {@value #VARIABLE_SIZE}. Such strings are encoded for this
   * separator and decoded after the results from split is returned.
   *
   * @param source byte array to be split.
   * @param sizes sizes of relative components/segments.
   * @return source split by this separator as per the sizes specified..
   */
  public byte[][] split(byte[] source, int[] sizes) {
    return split(source, this.bytes, sizes);
  }

  /**
   * Splits the source array into multiple array segments using this separator,
   * as many times as splits are found. This will naturally produce copied byte
   * arrays for each of the split segments.
   *
   * @param source byte array to be split
   * @return source split by this separator.
   */
  public byte[][] split(byte[] source) {
    return split(source, this.bytes);
  }

  /**
   * Returns a list of ranges identifying [start, end) -- closed, open --
   * positions within the source byte array that would be split using the
   * separator byte array.
   * The sizes indicate the sizes of the relative components/segments.
   * In case one of the segments contains this separator before the specified
   * size is reached, the separator will be considered part of that segment and
   * we will continue till size is reached.
   * Variable length strings cannot contain this separator and are indiced with
   * a size of {@value #VARIABLE_SIZE}. Such strings are encoded for this
   * separator and decoded after the results from split is returned.
   *
   * @param source the source data
   * @param separator the separator pattern to look for
   * @param sizes indicate the sizes of the relative components/segments.
   * @return a list of ranges.
   */
  private static List<Range> splitRanges(byte[] source, byte[] separator,
      int[] sizes) {
    List<Range> segments = new ArrayList<Range>();
    if (source == null || separator == null) {
      return segments;
    }
    // VARIABLE_SIZE here indicates that there is no limit to number of segments
    // to return.
    int limit = VARIABLE_SIZE;
    if (sizes != null && sizes.length > 0) {
      limit = sizes.length;
    }
    int start = 0;
    int currentSegment = 0;
    itersource: for (int i = 0; i < source.length; i++) {
      for (int j = 0; j < separator.length; j++) {
        if (source[i + j] != separator[j]) {
          continue itersource;
        }
      }
      // all separator elements matched
      if (limit > VARIABLE_SIZE) {
        if (segments.size() >= (limit - 1)) {
          // everything else goes in one final segment
          break;
        }
        if (sizes != null) {
          int currentSegExpectedSize = sizes[currentSegment];
          if (currentSegExpectedSize > VARIABLE_SIZE) {
            int currentSegSize = i - start;
            if (currentSegSize < currentSegExpectedSize) {
              // Segment not yet complete. More bytes to parse.
              continue itersource;
            } else if (currentSegSize > currentSegExpectedSize) {
              // Segment is not as per size.
              throw new IllegalArgumentException(
                  "Segments not separated as per expected sizes");
            }
          }
        }
      }
      segments.add(new Range(start, i));
      start = i + separator.length;
      // i will be incremented again in outer for loop
      i += separator.length - 1;
      currentSegment++;
    }
    // add in remaining to a final range
    if (start <= source.length) {
      if (sizes != null) {
        // Check if final segment is as per size specified.
        if (sizes[currentSegment] > VARIABLE_SIZE &&
            source.length - start > sizes[currentSegment]) {
          // Segment is not as per size.
          throw new IllegalArgumentException(
              "Segments not separated as per expected sizes");
        }
      }
      segments.add(new Range(start, source.length));
    }
    return segments;
  }

  /**
   * Splits based on segments calculated based on limit/sizes specified for the
   * separator.
   *
   * @param source byte array to be split.
   * @param segments specifies the range for each segment.
   * @return a byte[][] split as per the segment ranges.
   */
  private static byte[][] split(byte[] source, List<Range> segments) {
    byte[][] splits = new byte[segments.size()][];
    for (int i = 0; i < segments.size(); i++) {
      Range r = segments.get(i);
      byte[] tmp = new byte[r.length()];
      if (tmp.length > 0) {
        System.arraycopy(source, r.start(), tmp, 0, r.length());
      }
      splits[i] = tmp;
    }
    return splits;
  }

  /**
   * Splits the source array into multiple array segments using the given
   * separator based on the sizes. This will naturally produce copied byte
   * arrays for each of the split segments.
   *
   * @param source source array.
   * @param separator separator represented as a byte array.
   * @param sizes sizes of relative components/segments.
   * @return byte[][] after splitting the source.
   */
  private static byte[][] split(byte[] source, byte[] separator, int[] sizes) {
    List<Range> segments = splitRanges(source, separator, sizes);
    return split(source, segments);
  }

  /**
   * Splits the source array into multiple array segments using the given
   * separator. This will naturally produce copied byte arrays for each of the
   * split segments.
   *
   * @param source Source array.
   * @param separator Separator represented as a byte array.
   * @return byte[][] after splitting the source.
   */
  private static byte[][] split(byte[] source, byte[] separator) {
    return split(source, separator, (int[]) null);
  }

  /**
   * Splits the source array into multiple array segments using the given
   * separator, up to a maximum of count items. This will naturally produce
   * copied byte arrays for each of the split segments.
   *
   * @param source Source array.
   * @param separator Separator represented as a byte array.
   * @param limit a non-positive value indicates no limit on number of segments.
   * @return byte[][] after splitting the input source.
   */
  private static byte[][] split(byte[] source, byte[] separator, int limit) {
    int[] sizes = null;
    if (limit > VARIABLE_SIZE) {
      sizes = new int[limit];
    }
    List<Range> segments = splitRanges(source, separator, sizes);
    return split(source, segments);
  }
}

相关信息

hadoop 源码目录

相关文章

hadoop AppIdKeyConverter 源码

hadoop BaseTable 源码

hadoop Column 源码

hadoop ColumnFamily 源码

hadoop ColumnHelper 源码

hadoop ColumnPrefix 源码

hadoop EventColumnName 源码

hadoop EventColumnNameConverter 源码

hadoop GenericConverter 源码

hadoop HBaseTimelineSchemaUtils 源码

0  赞