001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.mapreduce.tools;
019    
020    import java.io.IOException;
021    import java.util.ArrayList;
022    import java.util.List;
023    
024    import org.apache.commons.logging.Log;
025    import org.apache.commons.logging.LogFactory;
026    import org.apache.hadoop.classification.InterfaceAudience;
027    import org.apache.hadoop.classification.InterfaceStability;
028    import org.apache.hadoop.conf.Configuration;
029    import org.apache.hadoop.conf.Configured;
030    import org.apache.hadoop.ipc.RemoteException;
031    import org.apache.hadoop.mapred.JobConf;
032    import org.apache.hadoop.mapred.TIPStatus;
033    import org.apache.hadoop.mapreduce.Cluster;
034    import org.apache.hadoop.mapreduce.Counters;
035    import org.apache.hadoop.mapreduce.Job;
036    import org.apache.hadoop.mapreduce.JobID;
037    import org.apache.hadoop.mapreduce.JobPriority;
038    import org.apache.hadoop.mapreduce.JobStatus;
039    import org.apache.hadoop.mapreduce.TaskAttemptID;
040    import org.apache.hadoop.mapreduce.TaskCompletionEvent;
041    import org.apache.hadoop.mapreduce.TaskReport;
042    import org.apache.hadoop.mapreduce.TaskTrackerInfo;
043    import org.apache.hadoop.mapreduce.TaskType;
044    import org.apache.hadoop.mapreduce.jobhistory.HistoryViewer;
045    import org.apache.hadoop.mapreduce.v2.LogParams;
046    import org.apache.hadoop.security.AccessControlException;
047    import org.apache.hadoop.util.Tool;
048    import org.apache.hadoop.util.ToolRunner;
049    import org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.LogDumper;
050    
051    /**
052     * Interprets the map reduce cli options 
053     */
054    @InterfaceAudience.Public
055    @InterfaceStability.Stable
056    public class CLI extends Configured implements Tool {
057      private static final Log LOG = LogFactory.getLog(CLI.class);
058      private Cluster cluster;
059    
060      public CLI() {
061      }
062      
063      public CLI(Configuration conf) {
064        setConf(conf);
065      }
066      
067      public int run(String[] argv) throws Exception {
068        int exitCode = -1;
069        if (argv.length < 1) {
070          displayUsage("");
071          return exitCode;
072        }    
073        // process arguments
074        String cmd = argv[0];
075        String submitJobFile = null;
076        String jobid = null;
077        String taskid = null;
078        String historyFile = null;
079        String counterGroupName = null;
080        String counterName = null;
081        JobPriority jp = null;
082        String taskType = null;
083        String taskState = null;
084        int fromEvent = 0;
085        int nEvents = 0;
086        boolean getStatus = false;
087        boolean getCounter = false;
088        boolean killJob = false;
089        boolean listEvents = false;
090        boolean viewHistory = false;
091        boolean viewAllHistory = false;
092        boolean listJobs = false;
093        boolean listAllJobs = false;
094        boolean listActiveTrackers = false;
095        boolean listBlacklistedTrackers = false;
096        boolean displayTasks = false;
097        boolean killTask = false;
098        boolean failTask = false;
099        boolean setJobPriority = false;
100        boolean logs = false;
101    
102        if ("-submit".equals(cmd)) {
103          if (argv.length != 2) {
104            displayUsage(cmd);
105            return exitCode;
106          }
107          submitJobFile = argv[1];
108        } else if ("-status".equals(cmd)) {
109          if (argv.length != 2) {
110            displayUsage(cmd);
111            return exitCode;
112          }
113          jobid = argv[1];
114          getStatus = true;
115        } else if("-counter".equals(cmd)) {
116          if (argv.length != 4) {
117            displayUsage(cmd);
118            return exitCode;
119          }
120          getCounter = true;
121          jobid = argv[1];
122          counterGroupName = argv[2];
123          counterName = argv[3];
124        } else if ("-kill".equals(cmd)) {
125          if (argv.length != 2) {
126            displayUsage(cmd);
127            return exitCode;
128          }
129          jobid = argv[1];
130          killJob = true;
131        } else if ("-set-priority".equals(cmd)) {
132          if (argv.length != 3) {
133            displayUsage(cmd);
134            return exitCode;
135          }
136          jobid = argv[1];
137          try {
138            jp = JobPriority.valueOf(argv[2]); 
139          } catch (IllegalArgumentException iae) {
140            LOG.info(iae);
141            displayUsage(cmd);
142            return exitCode;
143          }
144          setJobPriority = true; 
145        } else if ("-events".equals(cmd)) {
146          if (argv.length != 4) {
147            displayUsage(cmd);
148            return exitCode;
149          }
150          jobid = argv[1];
151          fromEvent = Integer.parseInt(argv[2]);
152          nEvents = Integer.parseInt(argv[3]);
153          listEvents = true;
154        } else if ("-history".equals(cmd)) {
155          if (argv.length != 2 && !(argv.length == 3 && "all".equals(argv[1]))) {
156             displayUsage(cmd);
157             return exitCode;
158          }
159          viewHistory = true;
160          if (argv.length == 3 && "all".equals(argv[1])) {
161            viewAllHistory = true;
162            historyFile = argv[2];
163          } else {
164            historyFile = argv[1];
165          }
166        } else if ("-list".equals(cmd)) {
167          if (argv.length != 1 && !(argv.length == 2 && "all".equals(argv[1]))) {
168            displayUsage(cmd);
169            return exitCode;
170          }
171          if (argv.length == 2 && "all".equals(argv[1])) {
172            listAllJobs = true;
173          } else {
174            listJobs = true;
175          }
176        } else if("-kill-task".equals(cmd)) {
177          if (argv.length != 2) {
178            displayUsage(cmd);
179            return exitCode;
180          }
181          killTask = true;
182          taskid = argv[1];
183        } else if("-fail-task".equals(cmd)) {
184          if (argv.length != 2) {
185            displayUsage(cmd);
186            return exitCode;
187          }
188          failTask = true;
189          taskid = argv[1];
190        } else if ("-list-active-trackers".equals(cmd)) {
191          if (argv.length != 1) {
192            displayUsage(cmd);
193            return exitCode;
194          }
195          listActiveTrackers = true;
196        } else if ("-list-blacklisted-trackers".equals(cmd)) {
197          if (argv.length != 1) {
198            displayUsage(cmd);
199            return exitCode;
200          }
201          listBlacklistedTrackers = true;
202        } else if ("-list-attempt-ids".equals(cmd)) {
203          if (argv.length != 4) {
204            displayUsage(cmd);
205            return exitCode;
206          }
207          jobid = argv[1];
208          taskType = argv[2];
209          taskState = argv[3];
210          displayTasks = true;
211        } else if ("-logs".equals(cmd)) {
212          if (argv.length == 2 || argv.length ==3) {
213            logs = true;
214            jobid = argv[1];
215            if (argv.length == 3) {
216              taskid = argv[2];
217            }  else {
218              taskid = null;
219            }
220          } else {
221            displayUsage(cmd);
222            return exitCode;
223          }
224        } else {
225          displayUsage(cmd);
226          return exitCode;
227        }
228    
229        // initialize cluster
230        cluster = new Cluster(getConf());
231            
232        // Submit the request
233        try {
234          if (submitJobFile != null) {
235            Job job = Job.getInstance(new JobConf(submitJobFile));
236            job.submit();
237            System.out.println("Created job " + job.getJobID());
238            exitCode = 0;
239          } else if (getStatus) {
240            Job job = cluster.getJob(JobID.forName(jobid));
241            if (job == null) {
242              System.out.println("Could not find job " + jobid);
243            } else {
244              Counters counters = job.getCounters();
245              System.out.println();
246              System.out.println(job);
247              if (counters != null) {
248                System.out.println(counters);
249              } else {
250                System.out.println("Counters not available. Job is retired.");
251              }
252              exitCode = 0;
253            }
254          } else if (getCounter) {
255            Job job = cluster.getJob(JobID.forName(jobid));
256            if (job == null) {
257              System.out.println("Could not find job " + jobid);
258            } else {
259              Counters counters = job.getCounters();
260              if (counters == null) {
261                System.out.println("Counters not available for retired job " + 
262                jobid);
263                exitCode = -1;
264              } else {
265                System.out.println(getCounter(counters,
266                  counterGroupName, counterName));
267                exitCode = 0;
268              }
269            }
270          } else if (killJob) {
271            Job job = cluster.getJob(JobID.forName(jobid));
272            if (job == null) {
273              System.out.println("Could not find job " + jobid);
274            } else {
275              job.killJob();
276              System.out.println("Killed job " + jobid);
277              exitCode = 0;
278            }
279          } else if (setJobPriority) {
280            Job job = cluster.getJob(JobID.forName(jobid));
281            if (job == null) {
282              System.out.println("Could not find job " + jobid);
283            } else {
284              job.setPriority(jp);
285              System.out.println("Changed job priority.");
286              exitCode = 0;
287            } 
288          } else if (viewHistory) {
289            viewHistory(historyFile, viewAllHistory);
290            exitCode = 0;
291          } else if (listEvents) {
292            listEvents(cluster.getJob(JobID.forName(jobid)), fromEvent, nEvents);
293            exitCode = 0;
294          } else if (listJobs) {
295            listJobs(cluster);
296            exitCode = 0;
297          } else if (listAllJobs) {
298            listAllJobs(cluster);
299            exitCode = 0;
300          } else if (listActiveTrackers) {
301            listActiveTrackers(cluster);
302            exitCode = 0;
303          } else if (listBlacklistedTrackers) {
304            listBlacklistedTrackers(cluster);
305            exitCode = 0;
306          } else if (displayTasks) {
307            displayTasks(cluster.getJob(JobID.forName(jobid)), taskType, taskState);
308          } else if(killTask) {
309            TaskAttemptID taskID = TaskAttemptID.forName(taskid);
310            Job job = cluster.getJob(taskID.getJobID());
311            if (job == null) {
312              System.out.println("Could not find job " + jobid);
313            } else if (job.killTask(taskID)) {
314              System.out.println("Killed task " + taskid);
315              exitCode = 0;
316            } else {
317              System.out.println("Could not kill task " + taskid);
318              exitCode = -1;
319            }
320          } else if(failTask) {
321            TaskAttemptID taskID = TaskAttemptID.forName(taskid);
322            Job job = cluster.getJob(taskID.getJobID());
323            if (job == null) {
324                System.out.println("Could not find job " + jobid);
325            } else if(job.failTask(taskID)) {
326              System.out.println("Killed task " + taskID + " by failing it");
327              exitCode = 0;
328            } else {
329              System.out.println("Could not fail task " + taskid);
330              exitCode = -1;
331            }
332          } else if (logs) {
333            try {
334            JobID jobID = JobID.forName(jobid);
335            TaskAttemptID taskAttemptID = TaskAttemptID.forName(taskid);
336            LogParams logParams = cluster.getLogParams(jobID, taskAttemptID);
337            LogDumper logDumper = new LogDumper();
338            logDumper.setConf(getConf());
339            logDumper.dumpAContainersLogs(logParams.getApplicationId(),
340                logParams.getContainerId(), logParams.getNodeId(),
341                logParams.getOwner());
342            } catch (IOException e) {
343              if (e instanceof RemoteException) {
344                throw e;
345              } 
346              System.out.println(e.getMessage());
347            }
348          }
349        } catch (RemoteException re) {
350          IOException unwrappedException = re.unwrapRemoteException();
351          if (unwrappedException instanceof AccessControlException) {
352            System.out.println(unwrappedException.getMessage());
353          } else {
354            throw re;
355          }
356        } finally {
357          cluster.close();
358        }
359        return exitCode;
360      }
361    
362      private String getJobPriorityNames() {
363        StringBuffer sb = new StringBuffer();
364        for (JobPriority p : JobPriority.values()) {
365          sb.append(p.name()).append(" ");
366        }
367        return sb.substring(0, sb.length()-1);
368      }
369    
370      private String getTaskTypess() {
371        StringBuffer sb = new StringBuffer();
372        for (TaskType t : TaskType.values()) {
373          sb.append(t.name()).append(" ");
374        }
375        return sb.substring(0, sb.length()-1);
376      }
377    
378      /**
379       * Display usage of the command-line tool and terminate execution.
380       */
381      private void displayUsage(String cmd) {
382        String prefix = "Usage: CLI ";
383        String jobPriorityValues = getJobPriorityNames();
384        String taskTypes = getTaskTypess();
385        String taskStates = "running, completed";
386        if ("-submit".equals(cmd)) {
387          System.err.println(prefix + "[" + cmd + " <job-file>]");
388        } else if ("-status".equals(cmd) || "-kill".equals(cmd)) {
389          System.err.println(prefix + "[" + cmd + " <job-id>]");
390        } else if ("-counter".equals(cmd)) {
391          System.err.println(prefix + "[" + cmd + 
392            " <job-id> <group-name> <counter-name>]");
393        } else if ("-events".equals(cmd)) {
394          System.err.println(prefix + "[" + cmd + 
395            " <job-id> <from-event-#> <#-of-events>]. Event #s start from 1.");
396        } else if ("-history".equals(cmd)) {
397          System.err.println(prefix + "[" + cmd + " <jobHistoryFile>]");
398        } else if ("-list".equals(cmd)) {
399          System.err.println(prefix + "[" + cmd + " [all]]");
400        } else if ("-kill-task".equals(cmd) || "-fail-task".equals(cmd)) {
401          System.err.println(prefix + "[" + cmd + " <task-attempt-id>]");
402        } else if ("-set-priority".equals(cmd)) {
403          System.err.println(prefix + "[" + cmd + " <job-id> <priority>]. " +
404              "Valid values for priorities are: " 
405              + jobPriorityValues); 
406        } else if ("-list-active-trackers".equals(cmd)) {
407          System.err.println(prefix + "[" + cmd + "]");
408        } else if ("-list-blacklisted-trackers".equals(cmd)) {
409          System.err.println(prefix + "[" + cmd + "]");
410        } else if ("-list-attempt-ids".equals(cmd)) {
411          System.err.println(prefix + "[" + cmd + 
412              " <job-id> <task-type> <task-state>]. " +
413              "Valid values for <task-type> are " + taskTypes + ". " +
414              "Valid values for <task-state> are " + taskStates);
415        } else if ("-logs".equals(cmd)) {
416          System.err.println(prefix + "[" + cmd +
417              " <job-id> <task-attempt-id>]. " +
418              " <task-attempt-id> is optional to get task attempt logs.");      
419        } else {
420          System.err.printf(prefix + "<command> <args>\n");
421          System.err.printf("\t[-submit <job-file>]\n");
422          System.err.printf("\t[-status <job-id>]\n");
423          System.err.printf("\t[-counter <job-id> <group-name> <counter-name>]\n");
424          System.err.printf("\t[-kill <job-id>]\n");
425          System.err.printf("\t[-set-priority <job-id> <priority>]. " +
426            "Valid values for priorities are: " + jobPriorityValues + "\n");
427          System.err.printf("\t[-events <job-id> <from-event-#> <#-of-events>]\n");
428          System.err.printf("\t[-history <jobHistoryFile>]\n");
429          System.err.printf("\t[-list [all]]\n");
430          System.err.printf("\t[-list-active-trackers]\n");
431          System.err.printf("\t[-list-blacklisted-trackers]\n");
432          System.err.println("\t[-list-attempt-ids <job-id> <task-type> " +
433            "<task-state>]. " +
434            "Valid values for <task-type> are " + taskTypes + ". " +
435            "Valid values for <task-state> are " + taskStates);
436          System.err.printf("\t[-kill-task <task-attempt-id>]\n");
437          System.err.printf("\t[-fail-task <task-attempt-id>]\n");
438          System.err.printf("\t[-logs <job-id> <task-attempt-id>]\n\n");
439          ToolRunner.printGenericCommandUsage(System.out);
440        }
441      }
442        
443      private void viewHistory(String historyFile, boolean all) 
444        throws IOException {
445        HistoryViewer historyViewer = new HistoryViewer(historyFile,
446                                            getConf(), all);
447        historyViewer.print();
448      }
449    
450      protected long getCounter(Counters counters, String counterGroupName,
451          String counterName) throws IOException {
452        return counters.findCounter(counterGroupName, counterName).getValue();
453      }
454      
455      /**
456       * List the events for the given job
457       * @param jobId the job id for the job's events to list
458       * @throws IOException
459       */
460      private void listEvents(Job job, int fromEventId, int numEvents)
461          throws IOException, InterruptedException {
462        TaskCompletionEvent[] events = job.
463          getTaskCompletionEvents(fromEventId, numEvents);
464        System.out.println("Task completion events for " + job.getJobID());
465        System.out.println("Number of events (from " + fromEventId + ") are: " 
466          + events.length);
467        for(TaskCompletionEvent event: events) {
468          System.out.println(event.getStatus() + " " + 
469            event.getTaskAttemptId() + " " + 
470            getTaskLogURL(event.getTaskAttemptId(), event.getTaskTrackerHttp()));
471        }
472      }
473    
474      protected static String getTaskLogURL(TaskAttemptID taskId, String baseUrl) {
475        return (baseUrl + "/tasklog?plaintext=true&attemptid=" + taskId); 
476      }
477      
478    
479      /**
480       * Dump a list of currently running jobs
481       * @throws IOException
482       */
483      private void listJobs(Cluster cluster) 
484          throws IOException, InterruptedException {
485        List<JobStatus> runningJobs = new ArrayList<JobStatus>();
486        for (JobStatus job : cluster.getAllJobStatuses()) {
487          if (!job.isJobComplete()) {
488            runningJobs.add(job);
489          }
490        }
491        displayJobList(runningJobs.toArray(new JobStatus[0]));
492      }
493        
494      /**
495       * Dump a list of all jobs submitted.
496       * @throws IOException
497       */
498      private void listAllJobs(Cluster cluster) 
499          throws IOException, InterruptedException {
500        displayJobList(cluster.getAllJobStatuses());
501      }
502      
503      /**
504       * Display the list of active trackers
505       */
506      private void listActiveTrackers(Cluster cluster) 
507          throws IOException, InterruptedException {
508        TaskTrackerInfo[] trackers = cluster.getActiveTaskTrackers();
509        for (TaskTrackerInfo tracker : trackers) {
510          System.out.println(tracker.getTaskTrackerName());
511        }
512      }
513    
514      /**
515       * Display the list of blacklisted trackers
516       */
517      private void listBlacklistedTrackers(Cluster cluster) 
518          throws IOException, InterruptedException {
519        TaskTrackerInfo[] trackers = cluster.getBlackListedTaskTrackers();
520        if (trackers.length > 0) {
521          System.out.println("BlackListedNode \t Reason");
522        }
523        for (TaskTrackerInfo tracker : trackers) {
524          System.out.println(tracker.getTaskTrackerName() + "\t" + 
525            tracker.getReasonForBlacklist());
526        }
527      }
528    
529      private void printTaskAttempts(TaskReport report) {
530        if (report.getCurrentStatus() == TIPStatus.COMPLETE) {
531          System.out.println(report.getSuccessfulTaskAttemptId());
532        } else if (report.getCurrentStatus() == TIPStatus.RUNNING) {
533          for (TaskAttemptID t : 
534            report.getRunningTaskAttemptIds()) {
535            System.out.println(t);
536          }
537        }
538      }
539    
540      /**
541       * Display the information about a job's tasks, of a particular type and
542       * in a particular state
543       * 
544       * @param job the job
545       * @param type the type of the task (map/reduce/setup/cleanup)
546       * @param state the state of the task 
547       * (pending/running/completed/failed/killed)
548       */
549      protected void displayTasks(Job job, String type, String state) 
550      throws IOException, InterruptedException {
551        TaskReport[] reports = job.getTaskReports(TaskType.valueOf(type));
552        for (TaskReport report : reports) {
553          TIPStatus status = report.getCurrentStatus();
554          if ((state.equals("pending") && status ==TIPStatus.PENDING) ||
555              (state.equals("running") && status ==TIPStatus.RUNNING) ||
556              (state.equals("completed") && status == TIPStatus.COMPLETE) ||
557              (state.equals("failed") && status == TIPStatus.FAILED) ||
558              (state.equals("killed") && status == TIPStatus.KILLED)) {
559            printTaskAttempts(report);
560          }
561        }
562      }
563      
564      public void displayJobList(JobStatus[] jobs) 
565          throws IOException, InterruptedException {
566        System.out.println("Total jobs:" + jobs.length);
567        System.out.println("JobId\tState\tStartTime\t" +
568            "UserName\tQueue\tPriority\tMaps\tReduces\tUsedContainers\t" +
569            "RsvdContainers\tUsedMem\tRsvdMem\tNeededMem\tAM info");
570        for (JobStatus job : jobs) {
571          TaskReport[] mapReports =
572                     cluster.getJob(job.getJobID()).getTaskReports(TaskType.MAP);
573          TaskReport[] reduceReports =
574                     cluster.getJob(job.getJobID()).getTaskReports(TaskType.REDUCE);
575    
576          System.out.printf("%s\t%s\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%dM\t%dM\t%dM\t%s\n",
577              job.getJobID().toString(), job.getState(), job.getStartTime(),
578              job.getUsername(), job.getQueue(), 
579              job.getPriority().name(),
580              mapReports.length,
581              reduceReports.length,
582              job.getNumUsedSlots(),
583              job.getNumReservedSlots(),
584              job.getUsedMem(),
585              job.getReservedMem(),
586              job.getNeededMem(),
587              job.getSchedulingInfo());
588        }
589      }
590      
591      public static void main(String[] argv) throws Exception {
592        int res = ToolRunner.run(new CLI(), argv);
593        System.exit(res);
594      }
595    }