// Fetch the Watchdog through its static accessor, then pass it the Context and the
// ActivityManagerService reference via init().
final Watchdog watchdog = Watchdog.getInstance();
watchdog.init(context, mActivityManagerService);
Watchdog 是单例模式,第一次调用 getInstance() 时会创建对象并保存到静态变量 sWatchdog 中。Watchdog 的构造方法如下:
private Watchdog() {
super("watchdog");
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread", DEFAULT_TIMEOUT);
mHandlerCheckers.add(mMonitorChecker);
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
"main thread", DEFAULT_TIMEOUT));
// Add checker for shared UI thread.
mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
"ui thread", DEFAULT_TIMEOUT));
// And also check IO thread.
mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
"i/o thread", DEFAULT_TIMEOUT));
// And the display thread.
mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
"display thread", DEFAULT_TIMEOUT));
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());
mOpenFdMonitor = OpenFdMonitor.create();
// See the notes on DEFAULT_TIMEOUT.
assert DB ||
DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
}
Watchdog 的构造方法主要工作是创建若干 HandlerChecker,并把它们保存到 mHandlerCheckers 列表中,每个 HandlerChecker 对应一个被监控的线程。HandlerChecker 实现了 Runnable 接口,并在构造时持有被监控线程的 Handler,从而与该线程绑定在一起。除了在构造方法中创建 HandlerChecker 外,还可以通过 Watchdog 的 addThread() 方法来增加被监控的线程。相关代码如下:
/**
 * Stores the collaborators the watchdog needs and registers a receiver for
 * reboot requests.
 *
 * @param context  used to obtain the ContentResolver and to register the receiver
 * @param activity the ActivityManagerService instance, kept in {@code mActivity}
 */
public void init(Context context, ActivityManagerService activity) {
    mResolver = context.getContentResolver();
    mActivity = activity;

    // Listen for Intent.ACTION_REBOOT; android.Manifest.permission.REBOOT is passed as
    // the permission argument and no scheduler handler is supplied.
    final RebootRequestReceiver rebootReceiver = new RebootRequestReceiver();
    final IntentFilter rebootFilter = new IntentFilter(Intent.ACTION_REBOOT);
    context.registerReceiver(
            rebootReceiver, rebootFilter, android.Manifest.permission.REBOOT, null);
}
/**
 * Registers another thread to be watched, identified by a handler bound to it.
 * The new checker is labelled with the name of the handler's looper thread.
 *
 * @param thread        handler attached to the thread that should be checked
 * @param timeoutMillis wait limit passed to the new {@code HandlerChecker}
 * @throws RuntimeException if the watchdog thread is already alive
 */
public void addThread(Handler thread, long timeoutMillis) {
    synchronized (this) {
        if (isAlive()) {
            throw new RuntimeException("Threads can't be added once the Watchdog is running");
        }
        final Thread watchedThread = thread.getLooper().getThread();
        final HandlerChecker checker =
                new HandlerChecker(thread, watchedThread.getName(), timeoutMillis);
        mHandlerCheckers.add(checker);
    }
}
/**
 * Attaches a monitor to {@code mMonitorChecker}, the checker that runs all
 * registered monitors.
 *
 * @param monitor the monitor to attach
 * @throws RuntimeException if the watchdog thread is already alive
 */
public void addMonitor(Monitor monitor) {
    synchronized (this) {
        final boolean alreadyRunning = isAlive();
        if (alreadyRunning) {
            throw new RuntimeException("Monitors can't be added once the Watchdog is running");
        }
        mMonitorChecker.addMonitor(monitor);
    }
}
addMonitor() 方法在确认 Watchdog 线程尚未启动之后,只是调用了 mMonitorChecker.addMonitor(monitor)。在 Watchdog 构造函数中,有五个公共线程被添加到了监控列表:
主线程;FgThread;UiThread;IoThread;DisplayThread
SystemServer 中一些重要的服务有专用的线程来处理消息,同时 SystemServer 也启动了几个公共线程来为所有服务处理消息。这几个线程没有什么区别,只是线程优先级不同。除了这五个线程外,ActivityManagerService、PackageManagerService、PowerManagerService、WindowManagerService 等服务的线程也加入到了监控中。如果一个服务需要被监控,首先要实现 Watchdog.Monitor 接口,AMS 等服务都实现了它。Watchdog 线程的 run() 方法如下:
/**
 * Main loop of the watchdog thread. Every pass schedules a check on each
 * HandlerChecker, waits CHECK_INTERVAL (measured with SystemClock.uptimeMillis()),
 * asks the open-FD monitor (if any) and then evaluates the checkers. When a
 * checker is overdue, or the FD monitor reports a problem, the system process is
 * killed unless a debugger is attached or mAllowRestart is false.
 */
public void run() {
// Set once the half-timeout stack dump has been taken, so it is not repeated.
boolean waitedHalf = false;
while (true) {
final List<HandlerChecker> blockedCheckers;
final String subject;
final boolean allowRestart;
// NOTE(review): this is declared inside the loop, so it is 0 at the start of every
// pass; in the code visible here the decrement below therefore never runs and the
// value is only ever 0 or 2 — confirm against the elided section.
int debuggerWasConnected = 0;
synchronized (this) {
long timeout = CHECK_INTERVAL;
// Post a check to every monitored thread.
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerChecker hc = mHandlerCheckers.get(i);
hc.scheduleCheckLocked();
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// Sleep for CHECK_INTERVAL. wait() may return early, so the remaining time is
// recomputed from the uptime clock and the wait is repeated until it is used up.
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
wait(timeout);
} catch (InterruptedException e) {
// Interruption is only logged; the loop keeps waiting for the remaining time.
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
boolean fdLimitTriggered = false;
if (mOpenFdMonitor != null) {
fdLimitTriggered = mOpenFdMonitor.monitor();
}
if (!fdLimitTriggered) {
// Determine whether any monitored thread is in trouble.
final int waitState = evaluateCheckerCompletionLocked();
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
ActivityManagerService.dumpStackTraces(true, pids, null, null,
getInterestingNativePids());
waitedHalf = true;
}
continue;
}
// something is overdue!
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
} else {
// The FD monitor tripped: no checker is blamed, only the subject text is set.
blockedCheckers = Collections.emptyList();
subject = "Open FD high water mark reached";
}
// Read mAllowRestart while still holding the lock.
allowRestart = mAllowRestart;
}
// ... (intervening code elided in this excerpt)
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
// Kill this process.
Process.killProcess(Process.myPid());
System.exit(10);
}
// Reached only when the process was not killed: start a fresh detection cycle.
waitedHalf = false;
}
}
run 方法中有一个无限循环,每次循环主要做三件事:
1:调用 scheduleCheckLocked() 给所有被监控的线程发送消息,代码如下:
/**
 * Starts a new check for this checker unless none is needed or one is already
 * pending. Called by the watchdog loop while it holds the Watchdog lock.
 *
 * A HandlerChecker watches both a thread and the monitors attached to it, so the
 * method first looks at whether there are any monitors at all.
 */
public void scheduleCheckLocked() {
    // No monitors to run and the target thread's message queue is polling (idle):
    // the thread is healthy, so mark the check complete without posting anything.
    final boolean noMonitors = mMonitors.size() == 0;
    if (noMonitors && mHandler.getLooper().getQueue().isPolling()) {
        mCompleted = true;
        return;
    }

    // Only start a new check when the previous one has finished; a pending check
    // keeps its original start time.
    if (mCompleted) {
        mCompleted = false;
        mCurrentMonitor = null;
        // Remember when this check was posted so latency can be measured later.
        mStartTime = SystemClock.uptimeMillis();
        // Put this checker at the front of the monitored thread's queue.
        mHandler.postAtFrontOfQueue(this);
    }
}
通过 mHandler.postAtFrontOfQueue() 发送的消息由 HandlerChecker 的 run() 方法处理,代码如下:
/**
 * Executed on the monitored thread after {@code scheduleCheckLocked()} posts this
 * checker. Calls every registered monitor in order and then marks the check
 * complete. {@code mCurrentMonitor} is written under the Watchdog lock, while the
 * {@code monitor()} call itself runs outside the lock.
 */
@Override
public void run() {
    final int monitorCount = mMonitors.size();
    for (int index = 0; index < monitorCount; index++) {
        // Publish which monitor is about to run.
        synchronized (Watchdog.this) {
            mCurrentMonitor = mMonitors.get(index);
        }
        mCurrentMonitor.monitor();
    }

    // Every monitor returned: record completion and clear the current monitor.
    synchronized (Watchdog.this) {
        mCompleted = true;
        mCurrentMonitor = null;
    }
}
如果 run 方法能够执行,说明受监控的线程本身没有问题,但还需要检查被监控服务的状态。通常 monitor() 方法的实现是获取服务中的锁,如果获取不到,线程就会挂起,mCompleted 也就无法被设置成 true。mCompleted 为 true 说明被监控的线程和服务正常,否则就可能有问题,需要根据等待时间是否超过超时时间来判断是否真的出了问题。BinderThreadMonitor 的 monitor() 实现如下:
/** Monitor that checks the availability of binder threads. */
private static final class BinderThreadMonitor implements Watchdog.Monitor {
@Override
public void monitor() {
// Blocks until a binder thread is available to process incoming IPCs, to make
// sure other processes can still communicate with the service.
Binder.blockUntilThreadAvailable();
}
}
2: 给受监控的服务发送消息后,调用 wait() 方法让 WatchDog 线程睡眠一段时间。
3: 逐个检查是否有线程或者服务出现问题,一旦确认超时就杀死进程(调试器已连接或不允许重启时除外);检查状态的方法为 evaluateCheckerCompletionLocked(),代码如下:
/**
 * Returns the worst completion state reported by any registered checker.
 * The state constants are ordered so that a larger value is worse.
 *
 * @return the maximum of all {@code getCompletionStateLocked()} results, or
 *         {@code COMPLETED} when no checker reports anything worse
 */
private int evaluateCheckerCompletionLocked() {
    int worstState = COMPLETED;
    for (final HandlerChecker checker : mHandlerCheckers) {
        final int checkerState = checker.getCompletionStateLocked();
        if (checkerState > worstState) {
            worstState = checkerState;
        }
    }
    return worstState;
}
上面的方法通过调用 HandlerChecker 的 getCompletionStateLocked() 来判断状态是否正常,该方法如下:
/**
 * Reports how the current check of this checker is progressing.
 *
 * @return {@code COMPLETED} if the check has finished; otherwise {@code WAITING}
 *         while the time since {@code mStartTime} is below half of
 *         {@code mWaitMax}, {@code WAITED_HALF} while it is below
 *         {@code mWaitMax}, and {@code OVERDUE} after that
 */
public int getCompletionStateLocked() {
    if (mCompleted) {
        return COMPLETED;
    }

    // Time elapsed since the check was posted, on the uptime clock.
    final long elapsed = SystemClock.uptimeMillis() - mStartTime;
    if (elapsed < mWaitMax / 2) {
        return WAITING;
    }
    if (elapsed < mWaitMax) {
        return WAITED_HALF;
    }
    return OVERDUE;
}
如上面方法所示,返回值通常有四个
- static final int COMPLETED = 0; 表示状态良好
- static final int WAITING = 1; 表示正在等待消息处理结果
- static final int WAITED_HALF = 2; 表示正在等待并且时间已经超过了超时时间一半
- static final int OVERDUE = 3; 表示已经超过了超时时间
evaluateCheckerCompletionLocked() 方法使用 Math.max 取所有 HandlerChecker 中最坏的状态。如果返回的是最坏的 OVERDUE,就可以杀死进程了;如果是其他状态,则继续下一轮循环检查。