1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.hbase.catalog;
19
20 import org.apache.commons.logging.Log;
21 import org.apache.commons.logging.LogFactory;
22 import org.apache.hadoop.classification.InterfaceAudience;
23 import org.apache.hadoop.conf.Configuration;
24 import org.apache.hadoop.hbase.Abortable;
25 import org.apache.hadoop.hbase.HRegionInfo;
26 import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
27 import org.apache.hadoop.hbase.ServerName;
28 import org.apache.hadoop.hbase.client.HConnection;
29 import org.apache.hadoop.hbase.client.HConnectionManager;
30 import org.apache.hadoop.hbase.client.HTable;
31 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
32 import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
33 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
34 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
35 import org.apache.hadoop.hbase.util.Bytes;
36 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
37 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
38 import org.apache.hadoop.ipc.RemoteException;
39
40 import java.io.EOFException;
41 import java.io.IOException;
42 import java.net.ConnectException;
43 import java.net.NoRouteToHostException;
44 import java.net.SocketException;
45 import java.net.SocketTimeoutException;
46 import java.net.UnknownHostException;
47
48 /**
49 * Tracks the availability of the catalog tables
50 * <code>.META.</code>.
51 *
52 * This class is "read-only" in that the locations of the catalog tables cannot
53 * be explicitly set. Instead, ZooKeeper is used to learn of the availability
54 * and location of <code>.META.</code>.
55 *
56 * <p>Call {@link #start()} to start up operation. Call {@link #stop()}} to
57 * interrupt waits and close up shop.
58 */
59 @InterfaceAudience.Private
60 public class CatalogTracker {
61 // TODO JDC 11/30 We don't even have ROOT anymore, revisit
62 // TODO: This class needs a rethink. The original intent was that it would be
63 // the one-stop-shop for meta locations and that it would get this
64 // info from reading and watching zk state. The class was to be used by
65 // servers when they needed to know of meta movement but also by
66 // client-side (inside in HTable) so rather than figure meta
67 // locations on fault, the client would instead get notifications out of zk.
68 //
69 // But this original intent is frustrated by the fact that this class has to
70 // read an hbase table, the -ROOT- table, to figure out the .META. region
71 // location which means we depend on an HConnection. HConnection will do
72 // retrying but also, it has its own mechanism for finding root and meta
73 // locations (and for 'verifying'; it tries the location and if it fails, does
74 // new lookup, etc.). So, at least for now, HConnection (or HTable) can't
75 // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
76 // For HT keep up a session with ZK? Rather, shouldn't we do like asynchbase
77 // where we'd open a connection to zk, read what we need then let the
78 // connection go?). The 'fix' is make it so both root and meta addresses
79 // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta).
80 //
81 // But even then, this class does 'verification' of the location and it does
82 // this by making a call over an HConnection (which will do its own root
83 // and meta lookups). Isn't this verification 'useless' since when we
84 // return, whatever is dependent on the result of this call then needs to
85 // use HConnection; what we have verified may change in meantime (HConnection
86 // uses the CT primitives, the root and meta trackers finding root locations).
87 //
88 // When meta is moved to zk, this class may make more sense. In the
89 // meantime, it does not cohere. It should just watch meta and root and not
90 // NOT do verification -- let that be out in HConnection since its going to
91 // be done there ultimately anyways.
92 //
93 // This class has spread throughout the codebase. It needs to be reigned in.
94 // This class should be used server-side only, even if we move meta location
95 // up into zk. Currently its used over in the client package. Its used in
96 // MetaReader and MetaEditor classes usually just to get the Configuration
97 // its using (It does this indirectly by asking its HConnection for its
98 // Configuration and even then this is just used to get an HConnection out on
99 // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
100 // doing CT fixup. St.Ack 09/30/2011.
101 //
102
103 // TODO: Timeouts have never been as advertised in here and its worse now
104 // with retries; i.e. the HConnection retries and pause goes ahead whatever
105 // the passed timeout is. Fix.
106 private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
107 private final HConnection connection;
108 private final ZooKeeperWatcher zookeeper;
109 private final MetaRegionTracker metaRegionTracker;
110 private boolean instantiatedzkw = false;
111 private Abortable abortable;
112
113 private boolean stopped = false;
114
115 static final byte [] META_REGION_NAME =
116 HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
117
118 /**
119 * Constructs a catalog tracker. Find current state of catalog tables.
120 * Begin active tracking by executing {@link #start()} post construction. Does
121 * not timeout.
122 *
123 * @param conf
124 * the {@link Configuration} from which a {@link HConnection} will be
125 * obtained; if problem, this connections
126 * {@link HConnection#abort(String, Throwable)} will be called.
127 * @throws IOException
128 */
129 public CatalogTracker(final Configuration conf) throws IOException {
130 this(null, conf, null);
131 }
132
133 /**
134 * Constructs the catalog tracker. Find current state of catalog tables.
135 * Begin active tracking by executing {@link #start()} post construction.
136 * Does not timeout.
137 * @param zk If zk is null, we'll create an instance (and shut it down
138 * when {@link #stop()} is called) else we'll use what is passed.
139 * @param conf
140 * @param abortable If fatal exception we'll call abort on this. May be null.
141 * If it is we'll use the Connection associated with the passed
142 * {@link Configuration} as our Abortable.
143 * @throws IOException
144 */
145 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
146 Abortable abortable)
147 throws IOException {
148 this(zk, conf, HConnectionManager.getConnection(conf), abortable);
149 }
150
151 public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
152 HConnection connection, Abortable abortable)
153 throws IOException {
154 this.connection = connection;
155 if (abortable == null) {
156 // A connection is abortable.
157 this.abortable = this.connection;
158 }
159 Abortable throwableAborter = new Abortable() {
160
161 @Override
162 public void abort(String why, Throwable e) {
163 throw new RuntimeException(why, e);
164 }
165
166 @Override
167 public boolean isAborted() {
168 return true;
169 }
170
171 };
172 if (zk == null) {
173 // Create our own. Set flag so we tear it down on stop.
174 this.zookeeper =
175 new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
176 abortable);
177 instantiatedzkw = true;
178 } else {
179 this.zookeeper = zk;
180 }
181 this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
182 }
183
184 /**
185 * Starts the catalog tracker.
186 * Determines current availability of catalog tables and ensures all further
187 * transitions of either region are tracked.
188 * @throws IOException
189 * @throws InterruptedException
190 */
191 public void start() throws IOException, InterruptedException {
192 LOG.debug("Starting catalog tracker " + this);
193 try {
194 this.metaRegionTracker.start();
195 } catch (RuntimeException e) {
196 Throwable t = e.getCause();
197 this.abortable.abort(e.getMessage(), t);
198 throw new IOException("Attempt to start meta tracker failed.", t);
199 }
200 }
201
202 /**
203 * Stop working.
204 * Interrupts any ongoing waits.
205 */
206 public void stop() {
207 if (!this.stopped) {
208 LOG.debug("Stopping catalog tracker " + this);
209 this.stopped = true;
210 this.metaRegionTracker.stop();
211 try {
212 if (this.connection != null) {
213 this.connection.close();
214 }
215 } catch (IOException e) {
216 // Although the {@link Closeable} interface throws an {@link
217 // IOException}, in reality, the implementation would never do that.
218 LOG.error("Attempt to close catalog tracker's connection failed.", e);
219 }
220 if (this.instantiatedzkw) {
221 this.zookeeper.close();
222 }
223 }
224 }
225
226 /**
227 * Gets the current location for <code>.META.</code> or null if location is
228 * not currently available.
229 * @return {@link ServerName} for server hosting <code>.META.</code> or null
230 * if none available
231 * @throws InterruptedException
232 */
233 public ServerName getMetaLocation() throws InterruptedException {
234 return this.metaRegionTracker.getMetaRegionLocation();
235 }
236
237 /**
238 * Checks whether meta regionserver znode has some non null data.
239 * @return true if data is not null, false otherwise.
240 */
241 public boolean isMetaLocationAvailable() {
242 return this.metaRegionTracker.isLocationAvailable();
243 }
244 /**
245 * Gets the current location for <code>.META.</code> if available and waits
246 * for up to the specified timeout if not immediately available. Returns null
247 * if the timeout elapses before root is available.
248 * @param timeout maximum time to wait for root availability, in milliseconds
249 * @return {@link ServerName} for server hosting <code>.META.</code> or null
250 * if none available
251 * @throws InterruptedException if interrupted while waiting
252 * @throws NotAllMetaRegionsOnlineException if meta not available before
253 * timeout
254 */
255 public ServerName waitForMeta(final long timeout)
256 throws InterruptedException, NotAllMetaRegionsOnlineException {
257 ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
258 if (sn == null) {
259 throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
260 }
261 return sn;
262 }
263
264 /**
265 * Gets a connection to the server hosting meta, as reported by ZooKeeper,
266 * waiting up to the specified timeout for availability.
267 * @param timeout How long to wait on meta location
268 * @see #waitForMeta for additional information
269 * @return connection to server hosting meta
270 * @throws InterruptedException
271 * @throws NotAllMetaRegionsOnlineException if timed out waiting
272 * @throws IOException
273 * @deprecated Use #getMetaServerConnection(long)
274 */
275 public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
276 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
277 return getMetaServerConnection(timeout);
278 }
279
280 /**
281 * Gets a connection to the server hosting meta, as reported by ZooKeeper,
282 * waiting up to the specified timeout for availability.
283 * <p>WARNING: Does not retry. Use an {@link HTable} instead.
284 * @param timeout How long to wait on meta location
285 * @see #waitForMeta for additional information
286 * @return connection to server hosting meta
287 * @throws InterruptedException
288 * @throws NotAllMetaRegionsOnlineException if timed out waiting
289 * @throws IOException
290 */
291 AdminService.BlockingInterface getMetaServerConnection(long timeout)
292 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
293 return getCachedConnection(waitForMeta(timeout));
294 }
295
296 /**
297 * Waits indefinitely for availability of <code>.META.</code>. Used during
298 * cluster startup. Does not verify meta, just that something has been
299 * set up in zk.
300 * @see #waitForMeta(long)
301 * @throws InterruptedException if interrupted while waiting
302 */
303 public void waitForMeta() throws InterruptedException {
304 while (!this.stopped) {
305 try {
306 if (waitForMeta(100) != null) break;
307 } catch (NotAllMetaRegionsOnlineException e) {
308 if (LOG.isTraceEnabled()) {
309 LOG.info(".META. still not available, sleeping and retrying." +
310 " Reason: " + e.getMessage());
311 }
312 }
313 }
314 }
315
316 /**
317 * @param sn ServerName to get a connection against.
318 * @return The AdminProtocol we got when we connected to <code>sn</code>
319 * May have come from cache, may not be good, may have been setup by this
320 * invocation, or may be null.
321 * @throws IOException
322 */
323 private AdminService.BlockingInterface getCachedConnection(ServerName sn)
324 throws IOException {
325 if (sn == null) {
326 return null;
327 }
328 AdminService.BlockingInterface service = null;
329 try {
330 service = connection.getAdmin(sn);
331 } catch (RetriesExhaustedException e) {
332 if (e.getCause() != null && e.getCause() instanceof ConnectException) {
333 // Catch this; presume it means the cached connection has gone bad.
334 } else {
335 throw e;
336 }
337 } catch (SocketTimeoutException e) {
338 LOG.debug("Timed out connecting to " + sn);
339 } catch (NoRouteToHostException e) {
340 LOG.debug("Connecting to " + sn, e);
341 } catch (SocketException e) {
342 LOG.debug("Exception connecting to " + sn);
343 } catch (UnknownHostException e) {
344 LOG.debug("Unknown host exception connecting to " + sn);
345 } catch (IOException ioe) {
346 Throwable cause = ioe.getCause();
347 if (ioe instanceof ConnectException) {
348 // Catch. Connect refused.
349 } else if (cause != null && cause instanceof EOFException) {
350 // Catch. Other end disconnected us.
351 } else if (cause != null && cause.getMessage() != null &&
352 cause.getMessage().toLowerCase().contains("connection reset")) {
353 // Catch. Connection reset.
354 } else {
355 throw ioe;
356 }
357
358 }
359 return service;
360 }
361
362 /**
363 * Verify we can connect to <code>hostingServer</code> and that its carrying
364 * <code>regionName</code>.
365 * @param hostingServer Interface to the server hosting <code>regionName</code>
366 * @param address The servername that goes with the <code>metaServer</code>
367 * Interface. Used logging.
368 * @param regionName The regionname we are interested in.
369 * @return True if we were able to verify the region located at other side of
370 * the Interface.
371 * @throws IOException
372 */
373 // TODO: We should be able to get the ServerName from the AdminProtocol
374 // rather than have to pass it in. Its made awkward by the fact that the
375 // HRI is likely a proxy against remote server so the getServerName needs
376 // to be fixed to go to a local method or to a cache before we can do this.
377 private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
378 final ServerName address, final byte [] regionName)
379 throws IOException {
380 if (hostingServer == null) {
381 LOG.info("Passed hostingServer is null");
382 return false;
383 }
384 Throwable t = null;
385 try {
386 // Try and get regioninfo from the hosting server.
387 return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
388 } catch (ConnectException e) {
389 t = e;
390 } catch (RetriesExhaustedException e) {
391 t = e;
392 } catch (RemoteException e) {
393 IOException ioe = e.unwrapRemoteException();
394 t = ioe;
395 } catch (IOException e) {
396 Throwable cause = e.getCause();
397 if (cause != null && cause instanceof EOFException) {
398 t = cause;
399 } else if (cause != null && cause.getMessage() != null
400 && cause.getMessage().contains("Connection reset")) {
401 t = cause;
402 } else {
403 t = e;
404 }
405 }
406 LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
407 " at address=" + address + ", exception=" + t);
408 return false;
409 }
410
411 /**
412 * Verify <code>.META.</code> is deployed and accessible.
413 * @param timeout How long to wait on zk for meta address (passed through to
414 * the internal call to {@link #waitForMetaServerConnection(long)}.
415 * @return True if the <code>.META.</code> location is healthy.
416 * @throws IOException
417 * @throws InterruptedException
418 */
419 public boolean verifyMetaRegionLocation(final long timeout)
420 throws InterruptedException, IOException {
421 AdminService.BlockingInterface service = null;
422 try {
423 service = waitForMetaServerConnection(timeout);
424 } catch (NotAllMetaRegionsOnlineException e) {
425 // Pass
426 } catch (ServerNotRunningYetException e) {
427 // Pass -- remote server is not up so can't be carrying root
428 } catch (UnknownHostException e) {
429 // Pass -- server name doesn't resolve so it can't be assigned anything.
430 }
431 return (service == null)? false:
432 verifyRegionLocation(service,
433 this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
434 }
435
436 public HConnection getConnection() {
437 return this.connection;
438 }
439 }