HDFS-9818. Correctly handle EC reconstruction work caused by not enough racks. Contributed by Jing Zhao.
Jing9 committed Feb 20, 2016
1 parent 6eae433 commit e54cc29
Showing 12 changed files with 388 additions and 53 deletions.
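The substance of the patch, in one picture: a striped block group that still has every internal block alive but sits on too few racks is now repaired with a plain copy of a single internal block to a new rack, while a group that actually lost an internal block still goes through full decoding; in parallel, the placement check learns the cluster's total rack count so clusters with fewer racks than the policy wants stop scheduling unsatisfiable work. Below is a minimal standalone sketch of that decision flow; EcRepairPlanner, plan, and the parameter names are illustrative assumptions, not the BlockManager API.

// Illustrative decision flow only; the real logic lives in
// BlockManager#validateReconstructionWork and ErasureCodingWork#addTaskToDatanode.
public final class EcRepairPlanner {
  enum Action { REPLICATE_ONE_INTERNAL_BLOCK, EC_DECODE }

  static Action plan(int liveInternalBlocks, int realTotalBlockNum,
      boolean placementSatisfied) {
    // Every internal block is alive but the racks are wrong: a cheap copy
    // of one internal block to a new rack fixes placement.
    if (liveInternalBlocks >= realTotalBlockNum && !placementSatisfied) {
      return Action.REPLICATE_ONE_INTERNAL_BLOCK;
    }
    // Otherwise some internal block is missing and must be decoded.
    return Action.EC_DECODE;
  }

  public static void main(String[] args) {
    // RS-6-3 group: 9 internal blocks all alive, but rack-deficient.
    System.out.println(plan(9, 9, false)); // REPLICATE_ONE_INTERNAL_BLOCK
    // Same group with one internal block lost.
    System.out.println(plan(8, 9, false)); // EC_DECODE
  }
}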
DFSStripedInputStream.java
@@ -260,7 +260,12 @@ protected void closeCurrentBlockReaders() {

private void closeReader(BlockReaderInfo readerInfo) {
if (readerInfo != null) {
-     // IOUtils.cleanup(null, readerInfo.reader);
+     if (readerInfo.reader != null) {
+       try {
+         readerInfo.reader.close();
+       } catch (Throwable ignored) {
+       }
+     }
readerInfo.skip();
}
}
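The hunk above replaces the old utility-based cleanup with an explicit close that swallows Throwable, so a misbehaving reader cannot abort the cleanup path before readerInfo.skip() runs. A standalone sketch of the same quiet-close idiom; closeQuietly is an illustrative helper, not part of the patch:

import java.io.Closeable;
import java.io.StringReader;

// Quiet-close idiom: never let close() failures escape cleanup code.
final class CloseUtil {
  static void closeQuietly(Closeable c) {
    if (c != null) {
      try {
        c.close();
      } catch (Throwable ignored) {
        // Swallow everything, including runtime exceptions, so the
        // caller's remaining cleanup still executes.
      }
    }
  }

  public static void main(String[] args) {
    closeQuietly(new StringReader("x")); // closes normally
    closeQuietly(null);                  // null-safe, no NPE
    System.out.println("cleanup continued");
  }
}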
3 changes: 3 additions & 0 deletions hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -432,6 +432,9 @@ Trunk (Unreleased)
HDFS-9794. Streamer threads may leak if failure happens when closing the
striped outputstream. (jing9)

HDFS-9818. Correctly handle EC reconstruction work caused by not enough
racks. (jing9)

BREAKDOWN OF HDFS-7285 SUBTASKS AND RELATED JIRAS

HDFS-7347. Configurable erasure coding policy for individual files and
BlockManager.java
@@ -1628,7 +1628,7 @@ private BlockReconstructionWork scheduleReconstruction(BlockInfo block,
for (int i = 0 ; i < liveBlockIndices.size(); i++) {
indices[i] = liveBlockIndices.get(i);
}
-     return new ErasureCodingWork(block, bc, srcNodes,
+     return new ErasureCodingWork(getBlockPoolId(), block, bc, srcNodes,
containingNodes, liveReplicaNodes, additionalReplRequired,
priority, indices);
} else {
@@ -1638,6 +1638,16 @@ private BlockReconstructionWork scheduleReconstruction(BlockInfo block,
}
}

private boolean isInNewRack(DatanodeDescriptor[] srcs,
DatanodeDescriptor target) {
for (DatanodeDescriptor src : srcs) {
if (src.getNetworkLocation().equals(target.getNetworkLocation())) {
return false;
}
}
return true;
}

private boolean validateReconstructionWork(BlockReconstructionWork rw) {
BlockInfo block = rw.getBlock();
int priority = rw.getPriority();
@@ -1665,31 +1675,14 @@ private boolean validateReconstructionWork(BlockReconstructionWork rw) {
DatanodeStorageInfo[] targets = rw.getTargets();
if ( (numReplicas.liveReplicas() >= requiredReplication) &&
(!isPlacementPolicySatisfied(block)) ) {
-     if (rw.getSrcNodes()[0].getNetworkLocation().equals(
-         targets[0].getDatanodeDescriptor().getNetworkLocation())) {
-       //No use continuing, unless a new rack in this case
+     if (!isInNewRack(rw.getSrcNodes(), targets[0].getDatanodeDescriptor())) {
+       // No use continuing, unless a new rack in this case
return false;
}
}

-     // Add block to the to be reconstructed list
-     if (block.isStriped()) {
-       assert rw instanceof ErasureCodingWork;
-       assert rw.getTargets().length > 0;
-       assert pendingNum == 0 : "Should wait the previous reconstruction"
-           + " to finish";
-       final ErasureCodingPolicy ecPolicy =
-           ((BlockInfoStriped) block).getErasureCodingPolicy();
-       assert ecPolicy != null;
-
-       rw.getTargets()[0].getDatanodeDescriptor().addBlockToBeErasureCoded(
-           new ExtendedBlock(getBlockPoolId(), block),
-           rw.getSrcNodes(), rw.getTargets(),
-           ((ErasureCodingWork) rw).getLiveBlockIndicies(), ecPolicy);
-     } else {
-       rw.getSrcNodes()[0].addBlockToBeReplicated(block, targets);
-     }

+     // Add block to the datanode's task list
+     rw.addTaskToDatanode();
DatanodeStorageInfo.incrementBlocksScheduled(targets);

// Move the block-replication into a "pending" state.
@@ -3973,7 +3966,8 @@ boolean isPlacementPolicySatisfied(BlockInfo storedBlock) {
.getPolicy(storedBlock.isStriped());
int numReplicas = storedBlock.isStriped() ? ((BlockInfoStriped) storedBlock)
.getRealDataBlockNum() : storedBlock.getReplication();
-     return placementPolicy.verifyBlockPlacement(locs, numReplicas).isPlacementPolicySatisfied();
+     return placementPolicy.verifyBlockPlacement(locs, numReplicas)
+         .isPlacementPolicySatisfied();
}

/**
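Before this change, the placement-only path compared just the first source node's rack against the first target's; with many EC sources that test passes or fails almost arbitrarily. The new isInNewRack demands that the target's rack differ from every source's rack. A standalone restatement over plain rack strings (assumed values):

import java.util.Arrays;

// Restatement of isInNewRack using plain network-location strings.
final class RackCheckDemo {
  static boolean isInNewRack(String[] srcRacks, String targetRack) {
    // A target only improves placement if no source already occupies its rack.
    return Arrays.stream(srcRacks).noneMatch(targetRack::equals);
  }

  public static void main(String[] args) {
    String[] srcs = {"/rack1", "/rack1", "/rack2"};
    System.out.println(isInNewRack(srcs, "/rack3")); // true: genuinely new
    System.out.println(isInNewRack(srcs, "/rack2")); // false: no gain
  }
}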
BlockPlacementPolicyDefault.java
@@ -901,7 +901,7 @@ public BlockPlacementStatus verifyBlockPlacement(DatanodeInfo[] locs,
locs = DatanodeDescriptor.EMPTY_ARRAY;
if (!clusterMap.hasClusterEverBeenMultiRack()) {
// only one rack
-       return new BlockPlacementStatusDefault(1, 1);
+       return new BlockPlacementStatusDefault(1, 1, 1);
}
int minRacks = 2;
minRacks = Math.min(minRacks, numberOfReplicas);
@@ -910,7 +910,8 @@ public BlockPlacementStatus verifyBlockPlacement(DatanodeInfo[] locs,
Set<String> racks = new TreeSet<>();
for (DatanodeInfo dn : locs)
racks.add(dn.getNetworkLocation());
-     return new BlockPlacementStatusDefault(racks.size(), minRacks);
+     return new BlockPlacementStatusDefault(racks.size(), minRacks,
+         clusterMap.getNumOfRacks());
}
/**
* Decide whether deleting the specified replica of the block still makes
BlockPlacementPolicyRackFaultTolerant.java
@@ -160,14 +160,16 @@ public BlockPlacementStatus verifyBlockPlacement(DatanodeInfo[] locs,
locs = DatanodeDescriptor.EMPTY_ARRAY;
if (!clusterMap.hasClusterEverBeenMultiRack()) {
// only one rack
-       return new BlockPlacementStatusDefault(1, 1);
+       return new BlockPlacementStatusDefault(1, 1, 1);
}
// 1. Check that all locations are different.
// 2. Count locations on different racks.
-     Set<String> racks = new TreeSet<String>();
-     for (DatanodeInfo dn : locs)
+     Set<String> racks = new TreeSet<>();
+     for (DatanodeInfo dn : locs) {
        racks.add(dn.getNetworkLocation());
-     return new BlockPlacementStatusDefault(racks.size(), numberOfReplicas);
+     }
+     return new BlockPlacementStatusDefault(racks.size(), numberOfReplicas,
+         clusterMap.getNumOfRacks());
}

@Override
BlockPlacementStatusDefault.java
@@ -21,15 +21,18 @@ public class BlockPlacementStatusDefault implements BlockPlacementStatus {

private int requiredRacks = 0;
private int currentRacks = 0;
private final int totalRacks;

-   public BlockPlacementStatusDefault(int currentRacks, int requiredRacks){
+   public BlockPlacementStatusDefault(int currentRacks, int requiredRacks,
+       int totalRacks){
this.requiredRacks = requiredRacks;
this.currentRacks = currentRacks;
this.totalRacks = totalRacks;
}

@Override
public boolean isPlacementPolicySatisfied() {
-     return requiredRacks <= currentRacks;
+     return requiredRacks <= currentRacks || currentRacks >= totalRacks;
}

@Override
@@ -38,7 +41,8 @@ public String getErrorDescription() {
return null;
}
return "Block should be additionally replicated on " +
-       (requiredRacks - currentRacks) + " more rack(s).";
+       (requiredRacks - currentRacks) +
+       " more rack(s). Total number of racks in the cluster: " + totalRacks;
}

}
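The new totalRacks argument caps the requirement: a block is now also considered correctly placed once it spans every rack the cluster has, even when that is fewer than requiredRacks. Without the cap, a 2-rack cluster would endlessly schedule reconstruction work for EC groups that can never reach the desired rack count. A standalone mirror of the patched check (illustrative class name, same expression):

// Mirror of the patched isPlacementPolicySatisfied() expression.
final class PlacementStatusDemo {
  static boolean satisfied(int currentRacks, int requiredRacks, int totalRacks) {
    return requiredRacks <= currentRacks || currentRacks >= totalRacks;
  }

  public static void main(String[] args) {
    // 2-rack cluster: spanning both racks now satisfies the policy.
    System.out.println(satisfied(2, 3, 2)); // true (false before the patch)
    // 5-rack cluster with the same spread still needs another rack.
    System.out.println(satisfied(2, 3, 5)); // false
  }
}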
BlockReconstructionWork.java
@@ -108,4 +108,9 @@ public int getAdditionalReplRequired() {
abstract void chooseTargets(BlockPlacementPolicy blockplacement,
BlockStoragePolicySuite storagePolicySuite,
Set<Node> excludedNodes);

/**
* add reconstruction task into a source datanode
*/
abstract void addTaskToDatanode();
}
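Adding the abstract addTaskToDatanode is what lets BlockManager#validateReconstructionWork shrink to a single rw.addTaskToDatanode() call: the replication-versus-EC branching moves into the two subclasses. A minimal sketch of that template-method shape, with stand-in class names:

// Stand-in classes showing the dispatch shape after the refactor.
abstract class Work {
  abstract void addTaskToDatanode();
}

final class PlainReplicationWork extends Work {
  @Override
  void addTaskToDatanode() {
    System.out.println("source datanode: copy the whole block");
  }
}

final class StripedGroupWork extends Work {
  @Override
  void addTaskToDatanode() {
    System.out.println("target datanode: decode, or copy one internal block");
  }
}

final class SchedulerDemo {
  public static void main(String[] args) {
    for (Work w : new Work[]{new PlainReplicationWork(), new StripedGroupWork()}) {
      w.addTaskToDatanode(); // caller no longer branches on block type
    }
  }
}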
ErasureCodingWork.java
@@ -17,15 +17,23 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.util.StripedBlockUtil;
import org.apache.hadoop.net.Node;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

class ErasureCodingWork extends BlockReconstructionWork {
private final byte[] liveBlockIndicies;
private final String blockPoolId;

-   public ErasureCodingWork(BlockInfo block,
+   public ErasureCodingWork(String blockPoolId, BlockInfo block,
BlockCollection bc,
DatanodeDescriptor[] srcNodes,
List<DatanodeDescriptor> containingNodes,
@@ -34,6 +42,7 @@ public ErasureCodingWork(BlockInfo block,
int priority, byte[] liveBlockIndicies) {
super(block, bc, srcNodes, containingNodes,
liveReplicaStorages, additionalReplRequired, priority);
this.blockPoolId = blockPoolId;
this.liveBlockIndicies = liveBlockIndicies;
BlockManager.LOG.debug("Creating an ErasureCodingWork to {} reconstruct ",
block);
@@ -47,15 +56,92 @@ byte[] getLiveBlockIndicies() {
void chooseTargets(BlockPlacementPolicy blockplacement,
BlockStoragePolicySuite storagePolicySuite,
Set<Node> excludedNodes) {
-     try {
-       // TODO: new placement policy for EC considering multiple writers
-       DatanodeStorageInfo[] chosenTargets = blockplacement.chooseTarget(
-           getBc().getName(), getAdditionalReplRequired(), getSrcNodes()[0],
-           getLiveReplicaStorages(), false, excludedNodes,
-           getBlock().getNumBytes(),
-           storagePolicySuite.getPolicy(getBc().getStoragePolicyID()));
-       setTargets(chosenTargets);
-     } finally {
+     // TODO: new placement policy for EC considering multiple writers
+     DatanodeStorageInfo[] chosenTargets = blockplacement.chooseTarget(
+         getBc().getName(), getAdditionalReplRequired(), getSrcNodes()[0],
+         getLiveReplicaStorages(), false, excludedNodes,
+         getBlock().getNumBytes(),
+         storagePolicySuite.getPolicy(getBc().getStoragePolicyID()));
+     setTargets(chosenTargets);
}

/**
* @return true if the current source nodes cover all the internal blocks.
* I.e., we only need to have more racks.
*/
private boolean hasAllInternalBlocks() {
final BlockInfoStriped block = (BlockInfoStriped) getBlock();
if (getSrcNodes().length < block.getRealTotalBlockNum()) {
return false;
}
BitSet bitSet = new BitSet(block.getTotalBlockNum());
for (byte index : liveBlockIndicies) {
bitSet.set(index);
}
for (int i = 0; i < block.getRealDataBlockNum(); i++) {
if (!bitSet.get(i)) {
return false;
}
}
for (int i = block.getDataBlockNum(); i < block.getTotalBlockNum(); i++) {
if (!bitSet.get(i)) {
return false;
}
}
return true;
}

/**
* We have all the internal blocks but not enough racks. Thus we do not need
* to do decoding but only simply make an extra copy of an internal block. In
* this scenario, use this method to choose the source datanode for simple
* replication.
* @return The index of the source datanode.
*/
private int chooseSource4SimpleReplication() {
Map<String, List<Integer>> map = new HashMap<>();
for (int i = 0; i < getSrcNodes().length; i++) {
final String rack = getSrcNodes()[i].getNetworkLocation();
List<Integer> dnList = map.get(rack);
if (dnList == null) {
dnList = new ArrayList<>();
map.put(rack, dnList);
}
dnList.add(i);
}
List<Integer> max = null;
for (Map.Entry<String, List<Integer>> entry : map.entrySet()) {
if (max == null || entry.getValue().size() > max.size()) {
max = entry.getValue();
}
}
assert max != null;
return max.get(0);
}

@Override
void addTaskToDatanode() {
assert getTargets().length > 0;
BlockInfoStriped stripedBlk = (BlockInfoStriped) getBlock();

// if we already have all the internal blocks, but not enough racks,
// we only need to replicate one internal block to a new rack
if (hasAllInternalBlocks()) {
int sourceIndex = chooseSource4SimpleReplication();
final byte blockIndex = liveBlockIndicies[sourceIndex];
final DatanodeDescriptor source = getSrcNodes()[sourceIndex];
final long internBlkLen = StripedBlockUtil.getInternalBlockLength(
stripedBlk.getNumBytes(), stripedBlk.getCellSize(),
stripedBlk.getDataBlockNum(), blockIndex);
final Block targetBlk = new Block(
stripedBlk.getBlockId() + blockIndex, internBlkLen,
stripedBlk.getGenerationStamp());
source.addBlockToBeReplicated(targetBlk, getTargets());
} else {
getTargets()[0].getDatanodeDescriptor().addBlockToBeErasureCoded(
new ExtendedBlock(blockPoolId, stripedBlk),
getSrcNodes(), getTargets(), getLiveBlockIndicies(),
stripedBlk.getErasureCodingPolicy());
}
}
}
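In the replicate-one-block path above, the Block handed to addBlockToBeReplicated is synthesized: its ID is the group's base ID plus the internal block index (stripedBlk.getBlockId() + blockIndex), and its length comes from StripedBlockUtil.getInternalBlockLength. A simplified standalone version of that length computation, assuming the standard round-robin cell layout; it mirrors the common cases only, so treat it as an approximation of the library method:

// Simplified internal-block sizing for a striped group: data blocks get
// cells round-robin; a parity block is as long as the longest data block.
final class InternalBlockSizeDemo {
  static long internalBlockLength(long groupBytes, int cellSize,
      int dataBlkNum, int blockIndex) {
    long stripeBytes = (long) cellSize * dataBlkNum;
    long fullStripes = groupBytes / stripeBytes;
    long remainder = groupBytes % stripeBytes;
    long base = fullStripes * cellSize;
    if (blockIndex >= dataBlkNum) {
      // Parity: same length as data block 0, the longest data block.
      return base + Math.min(remainder, cellSize);
    }
    long extra = remainder - (long) blockIndex * cellSize;
    return base + Math.min(Math.max(extra, 0), cellSize);
  }

  public static void main(String[] args) {
    // RS-6-3, 64 KB cells, 100 KB group: block 0 holds 64 KB, block 1
    // holds the remaining 36 KB, blocks 2-5 are empty, parity holds 64 KB.
    int cell = 64 * 1024;
    System.out.println(internalBlockLength(100 * 1024, cell, 6, 0)); // 65536
    System.out.println(internalBlockLength(100 * 1024, cell, 6, 1)); // 36864
    System.out.println(internalBlockLength(100 * 1024, cell, 6, 7)); // 65536
  }
}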
ReplicationWork.java
@@ -53,4 +53,9 @@ assert getSrcNodes().length > 0
getSrcNodes()[0].decrementPendingReplicationWithoutTargets();
}
}

@Override
void addTaskToDatanode() {
getSrcNodes()[0].addBlockToBeReplicated(getBlock(), getTargets());
}
}
ErasureCodingWorker.java
@@ -177,8 +177,14 @@ public void processErasureCodingTasks(
Collection<BlockECReconstructionInfo> ecTasks) {
for (BlockECReconstructionInfo reconstructionInfo : ecTasks) {
try {
-       EC_RECONSTRUCTION_STRIPED_BLK_THREAD_POOL
-           .submit(new ReconstructAndTransferBlock(reconstructionInfo));
+       ReconstructAndTransferBlock task =
+           new ReconstructAndTransferBlock(reconstructionInfo);
+       if (task.hasValidTargets()) {
+         EC_RECONSTRUCTION_STRIPED_BLK_THREAD_POOL.submit(task);
+       } else {
+         LOG.warn("No missing internal block. Skip reconstruction for task:{}",
+             reconstructionInfo);
+       }
} catch (Throwable e) {
LOG.warn("Failed to reconstruct striped block {}",
reconstructionInfo.getExtendedBlock().getLocalBlock(), e);
@@ -292,6 +298,7 @@ private class ReconstructAndTransferBlock implements Runnable {
private final CompletionService<Void> readService =
new ExecutorCompletionService<>(
EC_RECONSTRUCTION_STRIPED_READ_THREAD_POOL);
private final boolean hasValidTargets;

ReconstructAndTransferBlock(BlockECReconstructionInfo reconstructionInfo) {
ErasureCodingPolicy ecPolicy = reconstructionInfo
@@ -339,10 +346,14 @@ private class ReconstructAndTransferBlock implements Runnable {
seqNo4Targets[i] = 0;
}

-     getTargetIndices();
+     hasValidTargets = getTargetIndices();
cachingStrategy = CachingStrategy.newDefaultStrategy();
}

boolean hasValidTargets() {
return hasValidTargets;
}

private ByteBuffer allocateBuffer(int length) {
return ByteBuffer.allocate(length);
}
@@ -505,24 +516,30 @@ private void initChecksumAndBufferSizeIfNeeded(BlockReader blockReader) {
}
}

-   private void getTargetIndices() {
+   /**
+    * @return true if there is valid target for reconstruction
+    */
+   private boolean getTargetIndices() {
BitSet bitset = new BitSet(dataBlkNum + parityBlkNum);
for (int i = 0; i < sources.length; i++) {
bitset.set(liveIndices[i]);
}
int m = 0;
int k = 0;
boolean hasValidTarget = false;
for (int i = 0; i < dataBlkNum + parityBlkNum; i++) {
if (!bitset.get(i)) {
if (getBlockLen(blockGroup, i) > 0) {
if (m < targets.length) {
targetIndices[m++] = (short)i;
hasValidTarget = true;
}
} else {
zeroStripeIndices[k++] = (short)i;
}
}
}
return hasValidTarget;
}

/** the reading length should not exceed the length for reconstruction. */
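getTargetIndices pairs each missing, non-empty internal block with the next reconstruction target in order; hasValidTargets comes back false when every "missing" index is a zero-length stripe, which is exactly the case processErasureCodingTasks now skips. A standalone sketch of the pairing, with blockLengths standing in for getBlockLen and the zero-stripe bookkeeping omitted:

import java.util.BitSet;

// Pair missing, non-empty internal block indices with targets in order.
final class TargetIndexDemo {
  static boolean assignTargets(byte[] liveIndices, long[] blockLengths,
      short[] targetIndices) {
    BitSet live = new BitSet(blockLengths.length);
    for (byte i : liveIndices) {
      live.set(i);
    }
    int m = 0;
    boolean hasValidTarget = false;
    for (int i = 0; i < blockLengths.length; i++) {
      if (!live.get(i) && blockLengths[i] > 0 && m < targetIndices.length) {
        targetIndices[m++] = (short) i; // this target rebuilds block i
        hasValidTarget = true;
      }
    }
    return hasValidTarget;
  }

  public static void main(String[] args) {
    // RS-6-3 group: index 2 lost and non-empty; 4 and 5 are zero-length
    // stripes, so the single target is assigned to index 2.
    byte[] live = {0, 1, 3, 6, 7, 8};
    long[] lens = {1, 1, 1, 1, 0, 0, 1, 1, 1};
    short[] targets = new short[1];
    System.out.println(assignTargets(live, lens, targets)); // true
    System.out.println(targets[0]);                         // 2
  }
}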