1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19 package org.apache.hadoop.hbase.master;
20
21 import java.io.IOException;
22
23 import org.apache.commons.logging.Log;
24 import org.apache.commons.logging.LogFactory;
25 import org.apache.hadoop.hbase.TableName;
26 import org.apache.hadoop.hbase.HBaseTestingUtility;
27 import org.apache.hadoop.hbase.HConstants;
28 import org.apache.hadoop.hbase.HRegionInfo;
29 import org.apache.hadoop.hbase.LargeTests;
30 import org.apache.hadoop.hbase.client.HTable;
31 import org.apache.hadoop.hbase.client.Put;
32 import org.apache.hadoop.hbase.client.Result;
33 import org.apache.hadoop.hbase.client.ResultScanner;
34 import org.apache.hadoop.hbase.client.Scan;
35 import org.apache.hadoop.hbase.client.Durability;
36 import org.apache.hadoop.hbase.util.Bytes;
37 import org.junit.AfterClass;
38 import org.junit.Assert;
39 import org.junit.Before;
40 import org.junit.BeforeClass;
41 import org.junit.Ignore;
42 import org.junit.Test;
43 import org.junit.experimental.categories.Category;
44
45 /**
46 * Test transitions of state across the master. Sets up the cluster once and
47 * then runs a couple of tests.
48 */
49 @Category(LargeTests.class)
50 public class TestMasterTransitions {
51 private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
52 private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
53 private static final String TABLENAME = "master_transitions";
54 private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
55 Bytes.toBytes("b"), Bytes.toBytes("c")};
56
57 /**
58 * Start up a mini cluster and put a small table of many empty regions into it.
59 * @throws Exception
60 */
61 @BeforeClass public static void beforeAllTests() throws Exception {
62 TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
63 TEST_UTIL.startMiniCluster(2);
64 // Create a table of three families. This will assign a region.
65 TableName tableName = TableName.valueOf(TABLENAME);
66 TEST_UTIL.createTable(tableName, FAMILIES);
67 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
68 int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
69 TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
70 addToEachStartKey(countOfRegions);
71 t.close();
72 }
73
74 @AfterClass public static void afterAllTests() throws Exception {
75 TEST_UTIL.shutdownMiniCluster();
76 }
77
78 @Before public void setup() throws IOException {
79 TEST_UTIL.ensureSomeRegionServersAvailable(2);
80 }
81
82 /**
83 * Listener for regionserver events testing hbase-2428 (Infinite loop of
84 * region closes if META region is offline). In particular, listen
85 * for the close of the 'metaServer' and when it comes in, requeue it with a
86 * delay as though there were an issue processing the shutdown. As part of
87 * the requeuing, send over a close of a region on 'otherServer' so it comes
88 * into a master that has its meta region marked as offline.
89 */
90 /*
91 static class HBase2428Listener implements RegionServerOperationListener {
// Set of operations we've already delayed so we don't delay them repeatedly.
93 private final Set<RegionServerOperation> postponed =
94 new CopyOnWriteArraySet<RegionServerOperation>();
95 private boolean done = false;;
96 private boolean metaShutdownReceived = false;
97 private final HServerAddress metaAddress;
98 private final MiniHBaseCluster cluster;
99 private final int otherServerIndex;
100 private final HRegionInfo hri;
101 private int closeCount = 0;
102 static final int SERVER_DURATION = 3 * 1000;
103 static final int CLOSE_DURATION = 1 * 1000;
104
105 HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
106 final HRegionInfo closingHRI, final int otherServerIndex) {
107 this.cluster = c;
108 this.metaAddress = metaAddress;
109 this.hri = closingHRI;
110 this.otherServerIndex = otherServerIndex;
111 }
112
113 @Override
114 public boolean process(final RegionServerOperation op) throws IOException {
// If this is a regionserver shutdown and it is for the meta server, then we
// want to delay the processing of the shutdown and send off a close of a
// region on the 'otherServer'.
118 boolean result = true;
119 if (op instanceof ProcessServerShutdown) {
120 ProcessServerShutdown pss = (ProcessServerShutdown)op;
121 if (pss.getDeadServerAddress().equals(this.metaAddress)) {
122 // Don't postpone more than once.
123 if (!this.postponed.contains(pss)) {
124 // Close some region.
125 this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
126 new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
127 Bytes.toBytes("Forcing close in test")));
128 this.postponed.add(pss);
129 // Put off the processing of the regionserver shutdown processing.
130 pss.setDelay(SERVER_DURATION);
131 this.metaShutdownReceived = true;
132 // Return false. This will add this op to the delayed queue.
133 result = false;
134 }
135 }
136 } else {
137 // Have the close run frequently.
138 if (isWantedCloseOperation(op) != null) {
139 op.setDelay(CLOSE_DURATION);
140 // Count how many times it comes through here.
141 this.closeCount++;
142 }
143 }
144 return result;
145 }
146
147 public void processed(final RegionServerOperation op) {
148 if (isWantedCloseOperation(op) != null) return;
149 this.done = true;
150 }
151 */
152 /*
153 * @param op
154 * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
155 * cast as a ProcessRegionClose.
156 */
157 /*
158 private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
159 // Count every time we get a close operation.
160 if (op instanceof ProcessRegionClose) {
161 ProcessRegionClose c = (ProcessRegionClose)op;
162 if (c.regionInfo.equals(hri)) {
163 return c;
164 }
165 }
166 return null;
167 }
168
169 boolean isDone() {
170 return this.done;
171 }
172
173 boolean isMetaShutdownReceived() {
174 return metaShutdownReceived;
175 }
176
177 int getCloseCount() {
178 return this.closeCount;
179 }
180
181 @Override
182 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
183 return true;
184 }
185 }
186 */
187 /**
188 * In 2428, the meta region has just been set offline and then a close comes
189 * in.
190 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
191 */
192 @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
193 throws Exception {
194 /*
195 LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
196 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
197 final HMaster master = cluster.getMaster();
198 int metaIndex = cluster.getServerWithMeta();
199 // Figure the index of the server that is not server the .META.
200 int otherServerIndex = -1;
201 for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
202 if (i == metaIndex) continue;
203 otherServerIndex = i;
204 break;
205 }
206 final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
207 final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
208
209 // Get a region out on the otherServer.
210 final HRegionInfo hri =
211 otherServer.getOnlineRegions().iterator().next().getRegionInfo();
212
213 // Add our RegionServerOperationsListener
214 HBase2428Listener listener = new HBase2428Listener(cluster,
215 metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
216 master.getRegionServerOperationQueue().
217 registerRegionServerOperationListener(listener);
218 try {
219 // Now close the server carrying meta.
220 cluster.abortRegionServer(metaIndex);
221
222 // First wait on receipt of meta server shutdown message.
223 while(!listener.metaShutdownReceived) Threads.sleep(100);
224 while(!listener.isDone()) Threads.sleep(10);
225 // We should not have retried the close more times than it took for the
226 // server shutdown message to exit the delay queue and get processed
227 // (Multiple by two to add in some slop in case of GC or something).
228 assertTrue(listener.getCloseCount() > 1);
229 assertTrue(listener.getCloseCount() <
230 ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
231
232 // Assert the closed region came back online
233 assertRegionIsBackOnline(hri);
234 } finally {
235 master.getRegionServerOperationQueue().
236 unregisterRegionServerOperationListener(listener);
237 }
238 */
239 }
240
241 /**
242 * Test adding in a new server before old one on same host+port is dead.
243 * Make the test more onerous by having the server under test carry the meta.
244 * If confusion between old and new, purportedly meta never comes back. Test
245 * that meta gets redeployed.
246 */
247 @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
248 throws IOException {
249 /*
250 LOG.info("Running testAddingServerBeforeOldIsDead2413");
251 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
252 int count = count();
253 int metaIndex = cluster.getServerWithMeta();
254 MiniHBaseClusterRegionServer metaHRS =
255 (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
256 int port = metaHRS.getServerInfo().getServerAddress().getPort();
257 Configuration c = TEST_UTIL.getConfiguration();
258 String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
259 try {
260 LOG.info("KILLED=" + metaHRS);
261 metaHRS.kill();
262 c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
263 // Try and start new regionserver. It might clash with the old
264 // regionserver port so keep trying to get past the BindException.
265 HRegionServer hrs = null;
266 while (true) {
267 try {
268 hrs = cluster.startRegionServer().getRegionServer();
269 break;
270 } catch (IOException e) {
271 if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
272 InvocationTargetException ee = (InvocationTargetException)e.getCause();
273 if (ee.getCause() != null && ee.getCause() instanceof BindException) {
274 LOG.info("BindException; retrying: " + e.toString());
275 }
276 }
277 }
278 }
279 LOG.info("STARTED=" + hrs);
280 // Wait until he's been given at least 3 regions before we go on to try
281 // and count rows in table.
282 while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
283 LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
284 " regions");
285 assertEquals(count, count());
286 } finally {
287 c.set(HConstants.REGIONSERVER_PORT, oldPort);
288 }
289 */
290 }
291
/**
 * HBase2482 is about outstanding region openings. If any are outstanding
 * when a regionserver goes down, then they'll never deploy. They'll be
 * stuck in the regions-in-transition list forever. This listener looks
 * for a region opening HMsg and, if it's from the server passed on
 * construction, kills that server. It also looks out for a close message on
 * the victim server because that signifies the start of the fireworks.
 */
300 /*
301 static class HBase2482Listener implements RegionServerOperationListener {
302 private final HRegionServer victim;
303 private boolean abortSent = false;
304 // We closed regions on new server.
305 private volatile boolean closed = false;
306 // Copy of regions on new server
307 private final Collection<HRegion> copyOfOnlineRegions;
308 // This is the region that was in transition on the server we aborted. Test
309 // passes if this region comes back online successfully.
310 private HRegionInfo regionToFind;
311
312 HBase2482Listener(final HRegionServer victim) {
313 this.victim = victim;
314 // Copy regions currently open on this server so I can notice when
315 // there is a close.
316 this.copyOfOnlineRegions =
317 this.victim.getCopyOfOnlineRegionsSortedBySize().values();
318 }
319
320 @Override
321 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
322 if (!victim.getServerInfo().equals(serverInfo) ||
323 this.abortSent || !this.closed) {
324 return true;
325 }
326 if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
327 // Save the region that is in transition so can test later it came back.
328 this.regionToFind = incomingMsg.getRegionInfo();
329 String msg = "ABORTING " + this.victim + " because got a " +
330 HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
331 incomingMsg.getRegionInfo().getRegionNameAsString();
332 this.victim.abort(msg);
333 this.abortSent = true;
334 return true;
335 }
336
337 @Override
338 public boolean process(RegionServerOperation op) throws IOException {
339 return true;
340 }
341
342 @Override
343 public void processed(RegionServerOperation op) {
344 if (this.closed || !(op instanceof ProcessRegionClose)) return;
345 ProcessRegionClose close = (ProcessRegionClose)op;
346 for (HRegion r: this.copyOfOnlineRegions) {
347 if (r.getRegionInfo().equals(close.regionInfo)) {
348 // We've closed one of the regions that was on the victim server.
349 // Now can start testing for when all regions are back online again
350 LOG.info("Found close of " +
351 r.getRegionInfo().getRegionNameAsString() +
352 "; setting close happened flag");
353 this.closed = true;
354 break;
355 }
356 }
357 }
358 }
359 */
360 /**
361 * In 2482, a RS with an opening region on it dies. The said region is then
362 * stuck in the master's regions-in-transition and never leaves it. This
363 * test works by bringing up a new regionserver, waiting for the load
364 * balancer to give it some regions. Then, we close all on the new server.
365 * After sending all the close messages, we send the new regionserver the
366 * special blocking message so it can not process any more messages.
367 * Meantime reopening of the just-closed regions is backed up on the new
368 * server. Soon as master gets an opening region from the new regionserver,
369 * we kill it. We then wait on all regions to come back on line. If bug
370 * is fixed, this should happen soon as the processing of the killed server is
371 * done.
372 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
373 */
374 @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
375 throws Exception {
376 /*
377 LOG.info("Running testKillRSWithOpeningRegion2482");
378 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
379 if (cluster.getLiveRegionServerThreads().size() < 2) {
380 // Need at least two servers.
381 cluster.startRegionServer();
382 }
383 // Count how many regions are online. They need to be all back online for
384 // this test to succeed.
385 int countOfMetaRegions = countOfMetaRegions();
386 // Add a listener on the server.
387 HMaster m = cluster.getMaster();
388 // Start new regionserver.
389 MiniHBaseClusterRegionServer hrs =
390 (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
391 LOG.info("Started new regionserver: " + hrs.toString());
392 // Wait until has some regions before proceeding. Balancer will give it some.
393 int minimumRegions =
394 countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
395 while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
396 // Set the listener only after some regions have been opened on new server.
397 HBase2482Listener listener = new HBase2482Listener(hrs);
398 m.getRegionServerOperationQueue().
399 registerRegionServerOperationListener(listener);
400 try {
401 // Go close all non-catalog regions on this new server
402 closeAllNonCatalogRegions(cluster, hrs);
403 // After all closes, add blocking message before the region opens start to
404 // come in.
405 cluster.addMessageToSendRegionServer(hrs,
406 new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
407 // Wait till one of the above close messages has an effect before we start
408 // wait on all regions back online.
409 while (!listener.closed) Threads.sleep(100);
410 LOG.info("Past close");
411 // Make sure the abort server message was sent.
412 while(!listener.abortSent) Threads.sleep(100);
413 LOG.info("Past abort send; waiting on all regions to redeploy");
414 // Now wait for regions to come back online.
415 assertRegionIsBackOnline(listener.regionToFind);
416 } finally {
417 m.getRegionServerOperationQueue().
418 unregisterRegionServerOperationListener(listener);
419 }
420 */
421 }
422
423 /*
424 * @return Count of all non-catalog regions on the designated server
425 */
426 /*
427 private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
428 final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
429 throws IOException {
430 int countOfRegions = 0;
431 for (HRegion r: hrs.getOnlineRegions()) {
432 if (r.getRegionInfo().isMetaRegion()) continue;
433 cluster.addMessageToSendRegionServer(hrs,
434 new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
435 LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
436 " on " + hrs.toString());
437 countOfRegions++;
438 }
439 return countOfRegions;
440 }
441
442 private void assertRegionIsBackOnline(final HRegionInfo hri)
443 throws IOException {
444 // Region should have an entry in its startkey because of addRowToEachRegion.
445 byte [] row = getStartKey(hri);
446 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
447 Get g = new Get(row);
448 assertTrue((t.get(g)).size() > 0);
449 }
450
451 /*
452 * @return Count of regions in meta table.
453 * @throws IOException
454 */
455 /*
456 private static int countOfMetaRegions()
457 throws IOException {
458 HTable meta = new HTable(TEST_UTIL.getConfiguration(),
459 HConstants.META_TABLE_NAME);
460 int rows = 0;
461 Scan scan = new Scan();
462 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
463 ResultScanner s = meta.getScanner(scan);
464 for (Result r = null; (r = s.next()) != null;) {
465 byte [] b =
466 r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
467 if (b == null || b.length <= 0) break;
468 rows++;
469 }
470 s.close();
471 return rows;
472 }
473 */
474 /*
475 * Add to each of the regions in .META. a value. Key is the startrow of the
476 * region (except its 'aaa' for first region). Actual value is the row name.
477 * @param expected
478 * @return
479 * @throws IOException
480 */
481 private static int addToEachStartKey(final int expected) throws IOException {
482 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
483 HTable meta = new HTable(TEST_UTIL.getConfiguration(),
484 TableName.META_TABLE_NAME);
485 int rows = 0;
486 Scan scan = new Scan();
487 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
488 ResultScanner s = meta.getScanner(scan);
489 for (Result r = null; (r = s.next()) != null;) {
490 HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
491 if (hri == null) break;
492 if (!hri.getTableName().getNameAsString().equals(TABLENAME)) {
493 continue;
494 }
495
496 // If start key, add 'aaa'.
497 if(!hri.getTableName().getNameAsString().equals(TABLENAME)) {
498 continue;
499 }
500 byte [] row = getStartKey(hri);
501 Put p = new Put(row);
502 p.setDurability(Durability.SKIP_WAL);
503 p.add(getTestFamily(), getTestQualifier(), row);
504 t.put(p);
505 rows++;
506 }
507 s.close();
508 Assert.assertEquals(expected, rows);
509 t.close();
510 meta.close();
511 return rows;
512 }
513
514 /*
515 * @return Count of rows in TABLENAME
516 * @throws IOException
517 */
518 private static int count() throws IOException {
519 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
520 int rows = 0;
521 Scan scan = new Scan();
522 ResultScanner s = t.getScanner(scan);
523 for (Result r = null; (r = s.next()) != null;) {
524 rows++;
525 }
526 s.close();
527 LOG.info("Counted=" + rows);
528 t.close();
529 return rows;
530 }
531
532 /*
533 * @param hri
534 * @return Start key for hri (If start key is '', then return 'aaa'.
535 */
536 private static byte [] getStartKey(final HRegionInfo hri) {
537 return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
538 Bytes.toBytes("aaa"): hri.getStartKey();
539 }
540
541 private static byte [] getTestFamily() {
542 return FAMILIES[0];
543 }
544
545 private static byte [] getTestQualifier() {
546 return getTestFamily();
547 }
548
549 }
550