From 168f69db1995c0bce4daad2e4af1558282c2f80c Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Fri, 16 Apr 2010 00:31:58 -0400 Subject: [PATCH] Windows: define new event log messages for cm_Analyze VBUSY, VRESTARTING, etc. Add MSG_SERVER_REPORTS_VBUSY, MSG_SERVER_REPORTS_VRESTARTING, MSG_ALL_SERVERS_BUSY, MSG_ALL_SERVERS_OFFLINE, and MSG_ALL_SERVERS_DOWN. Add event message throttling. Only permit one copy of a message to be generated every five seconds if the message will duplicate the prior message. This often occurs when a server or volume becomes inaccessible and there were a large number of requests queued on it. Integrate these new messages into cm_Analyze processing for VBUSY, VRESTARTING, ALLDOWN, ALLOFFLINE, and ALLBUSY errors. LICENSE MIT Change-Id: Ib30595ded23724810e294133fa75873e20ffbf0b Reviewed-on: http://gerrit.openafs.org/1762 Tested-by: Jeffrey Altman Reviewed-by: Jeffrey Altman --- src/WINNT/afsd/afsd_eventlog.c | 82 ++++++++++++++++++++++++---- src/WINNT/afsd/afsd_eventmessages.mc | 43 ++++++++++++++- src/WINNT/afsd/cm_conn.c | 60 +++++++++++++++++--- 3 files changed, 164 insertions(+), 21 deletions(-) diff --git a/src/WINNT/afsd/afsd_eventlog.c b/src/WINNT/afsd/afsd_eventlog.c index 58027fb02..1122c7cb0 100644 --- a/src/WINNT/afsd/afsd_eventlog.c +++ b/src/WINNT/afsd/afsd_eventlog.c @@ -157,17 +157,26 @@ LogEventMessage(WORD wEventType, DWORD dwEventID, DWORD dwMessageID) // Use the ReportEvent API to write an entry to the system event log. // #define MAXARGS 8 -#define STRLEN 64 +#define STRLEN 128 + VOID LogEvent(WORD wEventType, DWORD dwEventID, ...) { va_list listArgs; HANDLE hEventSource; + HANDLE hMutex = NULL; LPTSTR lpArgs[MAXARGS]; CHAR lpStrings[MAXARGS][STRLEN]; + static CHAR lpLastStrings[MAXARGS][STRLEN]; WORD wNumArgs = 0; - WORD wNumStrings = 0; + static WORD wLastNumArgs = MAXARGS; + static time_t lastMessageTime = 0; + static WORD wLastEventType = 0; + static DWORD dwLastEventID = 0; + time_t now; DWORD code; + BOOL bLogMessage = TRUE; + WORD i = 0, j; // Ensure that our event source is properly initialized. if (!AddEventSource()) @@ -221,6 +230,11 @@ LogEvent(WORD wEventType, DWORD dwEventID, ...) case MSG_SERVER_REPORTS_VSALVAGE: case MSG_SERVER_REPORTS_VNOSERVICE: case MSG_SERVER_REPORTS_VIO: + case MSG_SERVER_REPORTS_VBUSY: + case MSG_SERVER_REPORTS_VRESTARTING: + case MSG_ALL_SERVERS_BUSY: + case MSG_ALL_SERVERS_OFFLINE: + case MSG_ALL_SERVERS_DOWN: wNumArgs = 2; lpArgs[0] = va_arg(listArgs, LPTSTR); StringCbPrintf(lpStrings[1],STRLEN,"%d",va_arg(listArgs,afs_int32)); @@ -307,19 +321,63 @@ LogEvent(WORD wEventType, DWORD dwEventID, ...) // Make sure we were not given too many args. if (wNumArgs >= MAXARGS) - return; + goto done; + + hMutex = CreateMutex( NULL, TRUE, "AFSD Event Log Mutex"); + if (hMutex == NULL) + goto done; + + if (GetLastError() == ERROR_ALREADY_EXISTS) { + code = WaitForSingleObject( hMutex, 500); + if (code != WAIT_OBJECT_0) + goto done; + } + + /* + * We rate limit consecutive duplicate messages to one every + * five seconds. + */ + now = time(NULL); + if (now < lastMessageTime + 5 && + wEventType == wLastEventType && + dwEventID == dwLastEventID && + wNumArgs == wLastNumArgs) { + for (i=0; iname, fidp->volume); + msgID = MSG_ALL_SERVERS_DOWN; + format = "All servers are unreachable when accessing cell %s volume %d."; + LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume); + } else { + osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLDOWN (VL Server)"); + } } else if (errorCode == CM_ERROR_ALLOFFLINE) { - osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE."); /* Volume instances marked offline will be restored by the * background daemon thread as they become available */ if (fidp) { + osi_Log2(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE (FS cell %s vol 0x%x)", + cellp->name, fidp->volume); + msgID = MSG_ALL_SERVERS_OFFLINE; + format = "All servers are offline when accessing cell %s volume %d."; + LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume); + code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, CM_GETVOL_FLAG_NO_LRU_UPDATE, &volp); @@ -345,15 +360,21 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, lock_ReleaseRead(&cm_volumeLock); volp = NULL; } - } + } else { + osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE (VL Server)"); + } } else if (errorCode == CM_ERROR_ALLBUSY) { /* Volumes that are busy cannot be determined to be non-busy * without actually attempting to access them. */ - osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY."); - if (fidp) { /* File Server query */ + osi_Log2(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY (FS cell %s vol 0x%x)", + cellp->name, fidp->volume); + msgID = MSG_ALL_SERVERS_BUSY; + format = "All servers are busy when accessing cell %s volume %d."; + LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume); + code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, CM_GETVOL_FLAG_NO_LRU_UPDATE, &volp); @@ -403,6 +424,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, volp = NULL; } } else { /* VL Server query */ + osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY (VL Server)."); + if (timeLeft > 7) { thrd_Sleep(5000); @@ -431,6 +454,30 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, free_svr_list = 1; } } + + switch ( errorCode ) { + case VBUSY: + msgID = MSG_SERVER_REPORTS_VBUSY; + format = "Server %s reported busy when accessing volume %d."; + break; + case VRESTARTING: + msgID = MSG_SERVER_REPORTS_VRESTARTING; + format = "Server %s reported restarting when accessing volume %d."; + break; + } + + if (serverp && fidp) { + /* Log server being offline for this volume */ + sprintf(addr, "%d.%d.%d.%d", + ((serverp->addr.sin_addr.s_addr & 0xff)), + ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), + ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + + osi_Log2(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume); + LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume); + } + lock_ObtainWrite(&cm_serverLock); for (tsrp = serversp; tsrp; tsrp=tsrp->next) { if (tsrp->status == srv_deleted) @@ -472,9 +519,6 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, else if (errorCode == VNOVOL || errorCode == VMOVED || errorCode == VOFFLINE || errorCode == VSALVAGE || errorCode == VNOSERVICE || errorCode == VIO) { - char *format; - DWORD msgID; - /* In case of timeout */ reqp->volumeError = errorCode; -- 2.39.5