001 /**
002 * Copyright (c) 2010 Yahoo! Inc. All rights reserved.
003 * Licensed under the Apache License, Version 2.0 (the "License");
004 * you may not use this file except in compliance with the License.
005 * You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software
010 * distributed under the License is distributed on an "AS IS" BASIS,
011 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012 * See the License for the specific language governing permissions and
013 * limitations under the License. See accompanying LICENSE file.
014 */
015 package org.apache.oozie.command.wf;
016
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.oozie.ErrorCode;
import org.apache.oozie.WorkflowActionBean;
import org.apache.oozie.WorkflowJobBean;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.client.WorkflowAction;
import org.apache.oozie.client.WorkflowJob;
import org.apache.oozie.command.Command;
import org.apache.oozie.command.CommandException;
import org.apache.oozie.service.DagXLogInfoService;
import org.apache.oozie.service.HadoopAccessorException;
import org.apache.oozie.service.HadoopAccessorService;
import org.apache.oozie.service.Services;
import org.apache.oozie.service.WorkflowAppService;
import org.apache.oozie.service.WorkflowStoreService;
import org.apache.oozie.store.StoreException;
import org.apache.oozie.store.WorkflowStore;
import org.apache.oozie.util.ParamChecker;
import org.apache.oozie.util.PropertiesUtils;
import org.apache.oozie.util.XConfiguration;
import org.apache.oozie.util.XLog;
import org.apache.oozie.util.XmlUtils;
import org.apache.oozie.workflow.WorkflowApp;
import org.apache.oozie.workflow.WorkflowException;
import org.apache.oozie.workflow.WorkflowInstance;
import org.apache.oozie.workflow.WorkflowLib;
import org.apache.oozie.workflow.lite.NodeHandler;
054
055 public class ReRunCommand extends WorkflowCommand<Void> {
056
057 private String jobId;
058 private Configuration conf;
059 private String authToken;
060 private Set<String> nodesToSkip = new HashSet<String>();
061 public static final String TO_SKIP = "TO_SKIP";
062
063 private static final Set<String> DISALLOWED_DEFAULT_PROPERTIES = new HashSet<String>();
064 private static final Set<String> DISALLOWED_USER_PROPERTIES = new HashSet<String>();
065
066 static {
067 String[] badUserProps = {PropertiesUtils.DAYS, PropertiesUtils.HOURS, PropertiesUtils.MINUTES,
068 PropertiesUtils.KB, PropertiesUtils.MB, PropertiesUtils.GB, PropertiesUtils.TB, PropertiesUtils.PB,
069 PropertiesUtils.RECORDS, PropertiesUtils.MAP_IN, PropertiesUtils.MAP_OUT, PropertiesUtils.REDUCE_IN,
070 PropertiesUtils.REDUCE_OUT, PropertiesUtils.GROUPS};
071 PropertiesUtils.createPropertySet(badUserProps, DISALLOWED_USER_PROPERTIES);
072
073 String[] badDefaultProps = {PropertiesUtils.HADOOP_USER, PropertiesUtils.HADOOP_UGI,
074 WorkflowAppService.HADOOP_JT_KERBEROS_NAME, WorkflowAppService.HADOOP_NN_KERBEROS_NAME};
075 PropertiesUtils.createPropertySet(badUserProps, DISALLOWED_DEFAULT_PROPERTIES);
076 PropertiesUtils.createPropertySet(badDefaultProps, DISALLOWED_DEFAULT_PROPERTIES);
077 }
078
079 public ReRunCommand(String jobId, Configuration conf, String authToken) {
080 super("rerun", "rerun", 1, XLog.STD);
081 this.jobId = ParamChecker.notEmpty(jobId, "jobId");
082 this.conf = ParamChecker.notNull(conf, "conf");
083 this.authToken = ParamChecker.notEmpty(authToken, "authToken");
084 }
085
086 /**
087 * Checks the pre-conditions that are required for workflow to recover - Last run of Workflow should be completed -
088 * The nodes that are to be skipped are to be completed successfully in the base run.
089 *
090 * @param wfBean Workflow bean
091 * @param actions List of actions of Workflow
092 * @throws org.apache.oozie.command.CommandException On failure of pre-conditions
093 */
094 private void checkPreConditions(WorkflowJobBean wfBean, List<WorkflowActionBean> actions) throws CommandException {
095 if (!(wfBean.getStatus().equals(WorkflowJob.Status.FAILED)
096 || wfBean.getStatus().equals(WorkflowJob.Status.KILLED) || wfBean.getStatus().equals(
097 WorkflowJob.Status.SUCCEEDED))) {
098 throw new CommandException(ErrorCode.E0805, wfBean.getStatus());
099 }
100 Set<String> unmachedNodes = new HashSet<String>(nodesToSkip);
101 for (WorkflowActionBean action : actions) {
102 if (nodesToSkip.contains(action.getName())) {
103 if (!action.getStatus().equals(WorkflowAction.Status.OK)
104 && !action.getStatus().equals(WorkflowAction.Status.ERROR)) {
105 throw new CommandException(ErrorCode.E0806, action.getName());
106 }
107 unmachedNodes.remove(action.getName());
108 }
109 }
110 if (unmachedNodes.size() > 0) {
111 StringBuilder sb = new StringBuilder();
112 String separator = "";
113 for (String s : unmachedNodes) {
114 sb.append(separator).append(s);
115 separator = ",";
116 }
117 throw new CommandException(ErrorCode.E0807, sb);
118 }
119 }
120
121 /**
122 * Parses the config and adds the nodes that are to be skipped to the skipped node list
123 */
124 private void parseSkippedNodeConf() {
125 if (conf != null) {
126 Collection<String> skipNodes = conf.getStringCollection(OozieClient.RERUN_SKIP_NODES);
127 for (String str : skipNodes) {
128 // trimming is required
129 nodesToSkip.add(str.trim());
130 }
131 }
132 }
133
134 protected Void call(WorkflowStore store) throws StoreException, CommandException {
135 incrJobCounter(1);
136 WorkflowJobBean wfBean = store.getWorkflow(jobId, false);
137 setLogInfo(wfBean);
138 List<WorkflowActionBean> actions = store.getActionsForWorkflow(jobId, false);
139 WorkflowInstance oldWfInstance = wfBean.getWorkflowInstance();
140 WorkflowInstance newWfInstance;
141 XLog log = XLog.getLog(getClass());
142 parseSkippedNodeConf();
143 checkPreConditions(wfBean, actions);
144
145 WorkflowAppService wps = Services.get().get(WorkflowAppService.class);
146 try {
147 XLog.Info.get().setParameter(DagXLogInfoService.TOKEN, conf.get(OozieClient.LOG_TOKEN));
148 WorkflowApp app = wps.parseDef(conf, authToken);
149 XConfiguration protoActionConf = wps.createProtoActionConf(conf, authToken);
150 WorkflowLib workflowLib = Services.get().get(WorkflowStoreService.class).getWorkflowLibWithNoDB();
151
152 Path configDefault = new Path(conf.get(OozieClient.APP_PATH), SubmitCommand.CONFIG_DEFAULT);
153
154 FileSystem fs = Services.get().get(HadoopAccessorService.class).createFileSystem(wfBean.getUser(),
155 wfBean.getGroup(), configDefault.toUri(), new Configuration());
156
157 if (fs.exists(configDefault)) {
158 Configuration defaultConf = new XConfiguration(fs.open(configDefault));
159 PropertiesUtils.checkDisallowedProperties(defaultConf, DISALLOWED_DEFAULT_PROPERTIES);
160 XConfiguration.injectDefaults(defaultConf, conf);
161 }
162
163 PropertiesUtils.checkDisallowedProperties(conf, DISALLOWED_USER_PROPERTIES);
164
165 try {
166 newWfInstance = workflowLib.createInstance(app, conf, jobId);
167 }
168 catch (WorkflowException e) {
169 throw new StoreException(e);
170 }
171 wfBean.setAppName(app.getName());
172 wfBean.setProtoActionConf(protoActionConf.toXmlString());
173 }
174 catch (WorkflowException ex) {
175 throw new CommandException(ex);
176 }
177 catch (IOException ex) {
178 throw new CommandException(ErrorCode.E0803, ex);
179 }
180 catch (HadoopAccessorException e) {
181 throw new CommandException(e);
182 }
183
184 for (int i = 0; i < actions.size(); i++) {
185 if (!nodesToSkip.contains(actions.get(i).getName())) {
186 store.deleteAction(actions.get(i).getId());
187 log.info("Deleting Action[{0}] for re-run", actions.get(i).getId());
188 }
189 else {
190 copyActionData(newWfInstance, oldWfInstance);
191 }
192 }
193
194 wfBean.setAppPath(conf.get(OozieClient.APP_PATH));
195 wfBean.setConf(XmlUtils.prettyPrint(conf).toString());
196 wfBean.setLogToken(conf.get(OozieClient.LOG_TOKEN, ""));
197 wfBean.setUser(conf.get(OozieClient.USER_NAME));
198 wfBean.setGroup(conf.get(OozieClient.GROUP_NAME));
199 wfBean.setExternalId(conf.get(OozieClient.EXTERNAL_ID));
200 wfBean.setEndTime(null);
201 wfBean.setRun(wfBean.getRun() + 1);
202 wfBean.setStatus(WorkflowJob.Status.PREP);
203 wfBean.setWorkflowInstance(newWfInstance);
204 store.updateWorkflow(wfBean);
205 return null;
206 }
207
208 /**
209 * Copys the variables for skipped nodes from the old wfInstance to new one.
210 *
211 * @param newWfInstance
212 * @param oldWfInstance
213 */
214 private void copyActionData(WorkflowInstance newWfInstance, WorkflowInstance oldWfInstance) {
215 Map<String, String> oldVars = new HashMap<String, String>();
216 Map<String, String> newVars = new HashMap<String, String>();
217 oldVars = oldWfInstance.getAllVars();
218 for (String var : oldVars.keySet()) {
219 String actionName = var.split(WorkflowInstance.NODE_VAR_SEPARATOR)[0];
220 if (nodesToSkip.contains(actionName)) {
221 newVars.put(var, oldVars.get(var));
222 }
223 }
224 for (String node : nodesToSkip) {
225 // Setting the TO_SKIP variable to true. This will be used by
226 // SignalCommand and LiteNodeHandler to skip the action.
227 newVars.put(node + WorkflowInstance.NODE_VAR_SEPARATOR + TO_SKIP, "true");
228 String visitedFlag = NodeHandler.getLoopFlag(node);
229 // Removing the visited flag so that the action won't be considered
230 // a loop.
231 if (newVars.containsKey(visitedFlag)) {
232 newVars.remove(visitedFlag);
233 }
234 }
235 newWfInstance.setAllVars(newVars);
236 }
237
238 @Override
239 protected Void execute(WorkflowStore store) throws CommandException, StoreException {
240 try {
241 XLog.getLog(getClass()).debug("STARTED ReRunCommand for job " + jobId);
242 if (lock(jobId)) {
243 call(store);
244 }
245 else {
246 queueCallable(new ReRunCommand(jobId, conf, authToken), LOCK_FAILURE_REQUEUE_INTERVAL);
247 XLog.getLog(getClass()).warn("ReRunCommand lock was not acquired - failed {0}", jobId);
248 }
249 }
250 catch (InterruptedException e) {
251 queueCallable(new ReRunCommand(jobId, conf, authToken), LOCK_FAILURE_REQUEUE_INTERVAL);
252 XLog.getLog(getClass())
253 .warn("ReRunCommand lock was not acquired - interrupted exception failed {0}", jobId);
254 }
255 XLog.getLog(getClass()).debug("ENDED ReRunCommand for job " + jobId);
256 return null;
257 }
258 }