1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.util;
20
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Arrays;
24 import java.util.Collection;
25 import java.util.LinkedList;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Queue;
29 import java.util.Random;
30
31 import org.apache.commons.cli.CommandLine;
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.hadoop.conf.Configuration;
35 import org.apache.hadoop.hbase.ClusterStatus;
36 import org.apache.hadoop.hbase.HBaseCluster;
37 import org.apache.hadoop.hbase.HBaseConfiguration;
38 import org.apache.hadoop.hbase.HRegionInfo;
39 import org.apache.hadoop.hbase.HServerLoad;
40 import org.apache.hadoop.hbase.IntegrationTestingUtility;
41 import org.apache.hadoop.hbase.IntegrationTestDataIngestWithChaosMonkey;
42 import org.apache.hadoop.hbase.ServerName;
43 import org.apache.hadoop.hbase.Stoppable;
44 import org.apache.hadoop.hbase.client.HBaseAdmin;
45 import org.apache.hadoop.hbase.client.HTable;
46 import org.apache.hadoop.util.StringUtils;
47 import org.apache.hadoop.util.ToolRunner;
48
49 import com.google.common.collect.Lists;
50 import com.google.common.collect.Maps;
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
74
75 private static final Log LOG = LogFactory.getLog(ChaosMonkey.class);
76
77 private static final long ONE_SEC = 1000;
78 private static final long FIVE_SEC = 5 * ONE_SEC;
79 private static final long ONE_MIN = 60 * ONE_SEC;
80 private static final long TIMEOUT = ONE_MIN;
81
82 final IntegrationTestingUtility util;
83
84
85
86
87
88
/**
 * Builds a ChaosMonkey that runs the policies registered under the given names.
 *
 * @param util the testing utility used to reach the cluster
 * @param policies names of policies, looked up in {@code NAMED_POLICIES}
 */
public ChaosMonkey(IntegrationTestingUtility util, String... policies) {
  this.util = util;
  // Resolve the names into Policy instances and store them on this monkey.
  this.setPoliciesByName(policies);
}
93
94
95
96
97
98
/**
 * Builds a ChaosMonkey that runs the given policy instances.
 *
 * @param util the testing utility used to reach the cluster
 * @param policies the policies to run; the array is stored as-is (not copied)
 */
public ChaosMonkey(IntegrationTestingUtility util, Policy... policies) {
  this.policies = policies;
  this.util = util;
}
103
104 private void setPoliciesByName(String... policies) {
105 this.policies = new Policy[policies.length];
106 for (int i=0; i < policies.length; i++) {
107 this.policies[i] = NAMED_POLICIES.get(policies[i]);
108 }
109 }
110
111
112
113
114 public static class ActionContext {
115 private IntegrationTestingUtility util;
116
117 public ActionContext(IntegrationTestingUtility util) {
118 this.util = util;
119 }
120
121 public IntegrationTestingUtility getHBaseIntegrationTestingUtility() {
122 return util;
123 }
124
125 public HBaseCluster getHBaseCluster() {
126 return util.getHBaseClusterInterface();
127 }
128 }
129
130
131
132
133 public static class Action {
134
135
136
137
138
139 protected ActionContext context;
140 protected HBaseCluster cluster;
141 protected ClusterStatus initialStatus;
142 protected ServerName[] initialServers;
143
144 public void init(ActionContext context) throws IOException {
145 this.context = context;
146 cluster = context.getHBaseCluster();
147 initialStatus = cluster.getInitialClusterStatus();
148 Collection<ServerName> regionServers = initialStatus.getServers();
149 initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
150 }
151
152 public void perform() throws Exception { };
153
154
155
156 protected ServerName[] getCurrentServers() throws IOException {
157 Collection<ServerName> regionServers = cluster.getClusterStatus().getServers();
158 return regionServers.toArray(new ServerName[regionServers.size()]);
159 }
160
161 protected void killMaster(ServerName server) throws IOException {
162 LOG.info("Killing master:" + server);
163 cluster.killMaster(server);
164 cluster.waitForMasterToStop(server, TIMEOUT);
165 LOG.info("Killed master server:" + server);
166 }
167
168 protected void startMaster(ServerName server) throws IOException {
169 LOG.info("Starting master:" + server.getHostname());
170 cluster.startMaster(server.getHostname());
171 cluster.waitForActiveAndReadyMaster(TIMEOUT);
172 LOG.info("Started master: " + server);
173 }
174
175 protected void killRs(ServerName server) throws IOException {
176 LOG.info("Killing region server:" + server);
177 cluster.killRegionServer(server);
178 cluster.waitForRegionServerToStop(server, TIMEOUT);
179 LOG.info("Killed region server:" + server + ". Reported num of rs:"
180 + cluster.getClusterStatus().getServersSize());
181 }
182
183 protected void startRs(ServerName server) throws IOException {
184 LOG.info("Starting region server:" + server.getHostname());
185 cluster.startRegionServer(server.getHostname());
186 cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT);
187 LOG.info("Started region server:" + server + ". Reported num of rs:"
188 + cluster.getClusterStatus().getServersSize());
189 }
190 }
191
192 private static class RestartActionBase extends Action {
193 long sleepTime;
194
195 public RestartActionBase(long sleepTime) {
196 this.sleepTime = sleepTime;
197 }
198
199 void sleep(long sleepTime) {
200 LOG.info("Sleeping for:" + sleepTime);
201 Threads.sleep(sleepTime);
202 }
203
204 void restartMaster(ServerName server, long sleepTime) throws IOException {
205 sleepTime = Math.max(sleepTime, 1000);
206 killMaster(server);
207 sleep(sleepTime);
208 startMaster(server);
209 }
210
211 void restartRs(ServerName server, long sleepTime) throws IOException {
212 sleepTime = Math.max(sleepTime, 1000);
213 killRs(server);
214 sleep(sleepTime);
215 startRs(server);
216 }
217 }
218
219 public static class RestartActiveMaster extends RestartActionBase {
220 public RestartActiveMaster(long sleepTime) {
221 super(sleepTime);
222 }
223 @Override
224 public void perform() throws Exception {
225 LOG.info("Performing action: Restart active master");
226
227 ServerName master = cluster.getClusterStatus().getMaster();
228 restartMaster(master, sleepTime);
229 }
230 }
231
232 public static class RestartRandomRs extends RestartActionBase {
233 public RestartRandomRs(long sleepTime) {
234 super(sleepTime);
235 }
236
237 @Override
238 public void perform() throws Exception {
239 LOG.info("Performing action: Restart random region server");
240 ServerName server = selectRandomItem(getCurrentServers());
241
242 restartRs(server, sleepTime);
243 }
244 }
245
246 public static class RestartRsHoldingMeta extends RestartActionBase {
247 public RestartRsHoldingMeta(long sleepTime) {
248 super(sleepTime);
249 }
250 @Override
251 public void perform() throws Exception {
252 LOG.info("Performing action: Restart region server holding META");
253 ServerName server = cluster.getServerHoldingMeta();
254 if (server == null) {
255 LOG.warn("No server is holding .META. right now.");
256 return;
257 }
258 restartRs(server, sleepTime);
259 }
260 }
261
262 public static class RestartRsHoldingRoot extends RestartRandomRs {
263 public RestartRsHoldingRoot(long sleepTime) {
264 super(sleepTime);
265 }
266 @Override
267 public void perform() throws Exception {
268 LOG.info("Performing action: Restart region server holding ROOT");
269 ServerName server = cluster.getServerHoldingRoot();
270 if (server == null) {
271 LOG.warn("No server is holding -ROOT- right now.");
272 return;
273 }
274 restartRs(server, sleepTime);
275 }
276 }
277
278 public static class RestartRsHoldingTable extends RestartActionBase {
279
280 private final String tableName;
281
282 public RestartRsHoldingTable(long sleepTime, String tableName) {
283 super(sleepTime);
284 this.tableName = tableName;
285 }
286
287 @Override
288 public void perform() throws Exception {
289 HTable table = null;
290 Collection<ServerName> serverNames;
291 try {
292 Configuration conf = context.getHBaseIntegrationTestingUtility().getConfiguration();
293 table = new HTable(conf, tableName);
294 serverNames = table.getRegionLocations().values();
295 } catch (IOException e) {
296 LOG.debug("Error creating HTable used to get list of region locations.", e);
297 return;
298 } finally {
299 if (table != null) {
300 table.close();
301 }
302 }
303 Random random = new Random();
304 ServerName[] nameArray = serverNames.toArray(new ServerName[serverNames.size()]);
305 restartRs(nameArray[random.nextInt(nameArray.length)], sleepTime);
306 }
307 }
308
309 public static class MoveRegionsOfTable extends Action {
310 private final long sleepTime;
311 private final byte[] tableNameBytes;
312
313 public MoveRegionsOfTable(long sleepTime, String tableName) {
314 this.sleepTime = sleepTime;
315 this.tableNameBytes = Bytes.toBytes(tableName);
316 }
317
318 @Override
319 public void perform() throws Exception {
320 try {
321 HBaseAdmin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
322 List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes);
323 Collection<ServerName> serversList = admin.getClusterStatus().getServers();
324 ServerName[] servers = serversList.toArray(new ServerName[serversList.size()]);
325 Random random = new Random();
326 for (HRegionInfo regionInfo:regions) {
327 try {
328 byte[] destServerName =
329 Bytes.toBytes(servers[random.nextInt(servers.length)].getServerName());
330 admin.move(regionInfo.getRegionName(), destServerName);
331 } catch (Exception e) {
332 LOG.debug("Error moving region", e);
333 }
334 }
335 Thread.sleep(sleepTime);
336 } catch (Exception e) {
337 LOG.debug("Error performing MoveRegionsOfTable", e);
338 }
339 }
340 }
341
342
343
344
345 public static class BatchRestartRs extends RestartActionBase {
346 float ratio;
347
348 public BatchRestartRs(long sleepTime, float ratio) {
349 super(sleepTime);
350 this.ratio = ratio;
351 }
352
353 @Override
354 public void perform() throws Exception {
355 LOG.info(String.format("Performing action: Batch restarting %d%% of region servers",
356 (int)(ratio * 100)));
357 List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
358
359 for (ServerName server : selectedServers) {
360 LOG.info("Killing region server:" + server);
361 cluster.killRegionServer(server);
362 }
363
364 for (ServerName server : selectedServers) {
365 cluster.waitForRegionServerToStop(server, TIMEOUT);
366 }
367
368 LOG.info("Killed " + selectedServers.size() + " region servers. Reported num of rs:"
369 + cluster.getClusterStatus().getServersSize());
370
371 sleep(sleepTime);
372
373 for (ServerName server : selectedServers) {
374 LOG.info("Starting region server:" + server.getHostname());
375 cluster.startRegionServer(server.getHostname());
376
377 }
378 for (ServerName server : selectedServers) {
379 cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT);
380 }
381 LOG.info("Started " + selectedServers.size() +" region servers. Reported num of rs:"
382 + cluster.getClusterStatus().getServersSize());
383 }
384 }
385
386
387
388
389
390 public static class RollingBatchRestartRs extends BatchRestartRs {
391 public RollingBatchRestartRs(long sleepTime, float ratio) {
392 super(sleepTime, ratio);
393 }
394
395 @Override
396 public void perform() throws Exception {
397 Random random = new Random();
398 LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
399 (int)(ratio * 100)));
400 List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
401
402 Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
403 Queue<ServerName> deadServers = new LinkedList<ServerName>();
404
405
406 while (!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) {
407 boolean action = true;
408
409 if (serversToBeKilled.isEmpty() || deadServers.isEmpty()) {
410 action = deadServers.isEmpty();
411 } else {
412 action = random.nextBoolean();
413 }
414
415 if (action) {
416 ServerName server = serversToBeKilled.remove();
417 killRs(server);
418 deadServers.add(server);
419 } else {
420 ServerName server = deadServers.remove();
421 startRs(server);
422 }
423
424 sleep(random.nextInt((int)sleepTime));
425 }
426 }
427 }
428
429 public static class UnbalanceRegionsAction extends Action {
430 private double fractionOfRegions;
431 private double fractionOfServers;
432 private Random random = new Random();
433
434
435
436
437
438
439
440 public UnbalanceRegionsAction(double fractionOfRegions, double fractionOfServers) {
441 this.fractionOfRegions = fractionOfRegions;
442 this.fractionOfServers = fractionOfServers;
443 }
444
445 @Override
446 public void perform() throws Exception {
447 LOG.info("Unbalancing regions");
448 ClusterStatus status = this.cluster.getClusterStatus();
449 List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
450 int targetServerCount = (int)Math.ceil(fractionOfServers * victimServers.size());
451 List<byte[]> targetServers = new ArrayList<byte[]>(targetServerCount);
452 for (int i = 0; i < targetServerCount; ++i) {
453 int victimIx = random.nextInt(victimServers.size());
454 String serverName = victimServers.remove(victimIx).getServerName();
455 targetServers.add(Bytes.toBytes(serverName));
456 }
457
458 List<byte[]> victimRegions = new LinkedList<byte[]>();
459 for (ServerName server : victimServers) {
460 HServerLoad serverLoad = status.getLoad(server);
461
462 List<byte[]> regions = new LinkedList<byte[]>(serverLoad.getRegionsLoad().keySet());
463 int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
464 LOG.debug("Removing " + victimRegionCount + " regions from " + server.getServerName());
465 for (int i = 0; i < victimRegionCount; ++i) {
466 int victimIx = random.nextInt(regions.size());
467 String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
468 victimRegions.add(Bytes.toBytes(regionId));
469 }
470 }
471
472 LOG.info("Moving " + victimRegions.size() + " regions from " + victimServers.size()
473 + " servers to " + targetServers.size() + " different servers");
474 HBaseAdmin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
475 for (byte[] victimRegion : victimRegions) {
476 int targetIx = random.nextInt(targetServers.size());
477 admin.move(victimRegion, targetServers.get(targetIx));
478 }
479 }
480 }
481
482 public static class ForceBalancerAction extends Action {
483 @Override
484 public void perform() throws Exception {
485 LOG.info("Balancing regions");
486 HBaseAdmin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
487 boolean result = admin.balancer();
488 if (!result) {
489 LOG.error("Balancer didn't succeed");
490 }
491 }
492 }
493
494
495
496
497 public static class PolicyContext extends ActionContext {
498 public PolicyContext(IntegrationTestingUtility util) {
499 super(util);
500 }
501 }
502
503
504
505
506 public static abstract class Policy extends StoppableImplementation implements Runnable {
507 protected PolicyContext context;
508 public void init(PolicyContext context) throws Exception {
509 this.context = context;
510 }
511 }
512
513
514 public static class CompositeSequentialPolicy extends Policy {
515 private List<Policy> policies;
516 public CompositeSequentialPolicy(Policy... policies) {
517 this.policies = Arrays.asList(policies);
518 }
519
520 @Override
521 public void stop(String why) {
522 super.stop(why);
523 for (Policy p : policies) {
524 p.stop(why);
525 }
526 }
527
528 @Override
529 public void run() {
530 for (Policy p : policies) {
531 p.run();
532 }
533 }
534
535 @Override
536 public void init(PolicyContext context) throws Exception {
537 super.init(context);
538 for (Policy p : policies) {
539 p.init(context);
540 }
541 }
542 }
543
544
545 public static abstract class PeriodicPolicy extends Policy {
546 private long periodMs;
547
548 public PeriodicPolicy(long periodMs) {
549 this.periodMs = periodMs;
550 }
551
552 @Override
553 public void run() {
554
555 int jitter = new Random().nextInt((int)periodMs);
556 LOG.info("Sleeping for " + jitter + " to add jitter");
557 Threads.sleep(jitter);
558
559 while (!isStopped()) {
560 long start = System.currentTimeMillis();
561 runOneIteration();
562
563 if (isStopped()) return;
564 long sleepTime = periodMs - (System.currentTimeMillis() - start);
565 if (sleepTime > 0) {
566 LOG.info("Sleeping for: " + sleepTime);
567 Threads.sleep(sleepTime);
568 }
569 }
570 }
571
572 protected abstract void runOneIteration();
573
574 @Override
575 public void init(PolicyContext context) throws Exception {
576 super.init(context);
577 LOG.info("Using ChaosMonkey Policy: " + this.getClass() + ", period: " + periodMs);
578 }
579 }
580
581
582 public static class DoActionsOncePolicy extends PeriodicPolicy {
583 private List<Action> actions;
584
585 public DoActionsOncePolicy(long periodMs, List<Action> actions) {
586 super(periodMs);
587 this.actions = new ArrayList<ChaosMonkey.Action>(actions);
588 }
589
590 public DoActionsOncePolicy(long periodMs, Action... actions) {
591 this(periodMs, Arrays.asList(actions));
592 }
593
594 @Override
595 protected void runOneIteration() {
596 if (actions.isEmpty()) {
597 this.stop("done");
598 return;
599 }
600 Action action = actions.remove(0);
601
602 try {
603 action.perform();
604 } catch (Exception ex) {
605 LOG.warn("Exception occured during performing action: "
606 + StringUtils.stringifyException(ex));
607 }
608 }
609
610 @Override
611 public void init(PolicyContext context) throws Exception {
612 super.init(context);
613 for (Action action : actions) {
614 action.init(this.context);
615 }
616 }
617 }
618
619
620
621
622
623 public static class PeriodicRandomActionPolicy extends PeriodicPolicy {
624 private List<Pair<Action, Integer>> actions;
625
626 public PeriodicRandomActionPolicy(long periodMs, List<Pair<Action, Integer>> actions) {
627 super(periodMs);
628 this.actions = actions;
629 }
630
631 public PeriodicRandomActionPolicy(long periodMs, Pair<Action, Integer>... actions) {
632
633 this(periodMs, Arrays.asList(actions));
634 }
635
636 public PeriodicRandomActionPolicy(long periodMs, Action... actions) {
637 super(periodMs);
638 this.actions = new ArrayList<Pair<Action, Integer>>(actions.length);
639 for (Action action : actions) {
640 this.actions.add(new Pair<Action, Integer>(action, 1));
641 }
642 }
643
644 @Override
645 protected void runOneIteration() {
646 Action action = selectWeightedRandomItem(actions);
647 try {
648 action.perform();
649 } catch (Exception ex) {
650 LOG.warn("Exception occured during performing action: "
651 + StringUtils.stringifyException(ex));
652 }
653 }
654
655 @Override
656 public void init(PolicyContext context) throws Exception {
657 super.init(context);
658 for (Pair<Action, Integer> action : actions) {
659 action.getFirst().init(this.context);
660 }
661 }
662 }
663
664
665 static <T> T selectRandomItem(T[] items) {
666 Random random = new Random();
667 return items[random.nextInt(items.length)];
668 }
669
670
671 static <T> T selectWeightedRandomItem(List<Pair<T, Integer>> items) {
672 Random random = new Random();
673 int totalWeight = 0;
674 for (Pair<T, Integer> pair : items) {
675 totalWeight += pair.getSecond();
676 }
677
678 int cutoff = random.nextInt(totalWeight);
679 int cummulative = 0;
680 T item = null;
681
682
683 for (int i=0; i<items.size(); i++) {
684 int curWeight = items.get(i).getSecond();
685 if ( cutoff < cummulative + curWeight) {
686 item = items.get(i).getFirst();
687 break;
688 }
689 cummulative += curWeight;
690 }
691
692 return item;
693 }
694
695
696 static <T> List<T> selectRandomItems(T[] items, float ratio) {
697 Random random = new Random();
698 int remaining = (int)Math.ceil(items.length * ratio);
699
700 List<T> selectedItems = new ArrayList<T>(remaining);
701
702 for (int i=0; i<items.length && remaining > 0; i++) {
703 if (random.nextFloat() < ((float)remaining/(items.length-i))) {
704 selectedItems.add(items[i]);
705 remaining--;
706 }
707 }
708
709 return selectedItems;
710 }
711
712
713
714
715
716
717
718
719
720
721
722 @SuppressWarnings("unchecked")
723 private static final List<Pair<Action, Integer>> ALL_ACTIONS = Lists.newArrayList(
724 new Pair<Action,Integer>(new RestartActiveMaster(FIVE_SEC), 2),
725 new Pair<Action,Integer>(new RestartRandomRs(FIVE_SEC), 2),
726 new Pair<Action,Integer>(new RestartRandomRs(ONE_MIN), 2),
727 new Pair<Action,Integer>(new RestartRsHoldingMeta(FIVE_SEC), 1),
728 new Pair<Action,Integer>(new RestartRsHoldingRoot(FIVE_SEC), 1),
729 new Pair<Action,Integer>(new BatchRestartRs(FIVE_SEC, 0.5f), 2),
730 new Pair<Action,Integer>(new RollingBatchRestartRs(FIVE_SEC, 1.0f), 2)
731 );
732
733 public static final String EVERY_MINUTE_RANDOM_ACTION_POLICY = "EVERY_MINUTE_RANDOM_ACTION_POLICY";
734
735 private Policy[] policies;
736 private Thread[] monkeyThreads;
737
738 public void start() throws Exception {
739 monkeyThreads = new Thread[policies.length];
740
741 for (int i=0; i<policies.length; i++) {
742 policies[i].init(new PolicyContext(this.util));
743 Thread monkeyThread = new Thread(policies[i]);
744 monkeyThread.start();
745 monkeyThreads[i] = monkeyThread;
746 }
747 }
748
749 @Override
750 public void stop(String why) {
751 for (Policy policy : policies) {
752 policy.stop(why);
753 }
754 }
755
756 @Override
757 public boolean isStopped() {
758 return policies[0].isStopped();
759 }
760
761
762
763
764
765 public void waitForStop() throws InterruptedException {
766 for (Thread monkeyThread : monkeyThreads) {
767 monkeyThread.join();
768 }
769 }
770
/** Registry of the policies that can be selected by name. */
private static final Map<String, Policy> NAMED_POLICIES = Maps.newHashMap();
static {
  // The default policy: once a minute, perform one weighted-random restart action.
  Policy everyMinuteRandomAction = new PeriodicRandomActionPolicy(ONE_MIN, ALL_ACTIONS);
  NAMED_POLICIES.put(EVERY_MINUTE_RANDOM_ACTION_POLICY, everyMinuteRandomAction);
}
776
777 @Override
778 protected void addOptions() {
779 addOptWithArg("policy", "a named policy defined in ChaosMonkey.java. Possible values: "
780 + NAMED_POLICIES.keySet());
781
782 }
783
784 @Override
785 protected void processOptions(CommandLine cmd) {
786 String[] policies = cmd.getOptionValues("policy");
787 if (policies != null) {
788 setPoliciesByName(policies);
789 }
790 }
791
792 @Override
793 protected int doWork() throws Exception {
794 start();
795 waitForStop();
796 return 0;
797 }
798
799 public static void main(String[] args) throws Exception {
800 Configuration conf = HBaseConfiguration.create();
801 IntegrationTestingUtility.setUseDistributedCluster(conf);
802 IntegrationTestingUtility util = new IntegrationTestingUtility(conf);
803 util.initializeCluster(1);
804
805 ChaosMonkey monkey = new ChaosMonkey(util, EVERY_MINUTE_RANDOM_ACTION_POLICY);
806 int ret = ToolRunner.run(conf, monkey, args);
807 System.exit(ret);
808 }
809
810 }