From: Steven Jenkins Date: Fri, 14 Mar 2008 18:05:00 +0000 (+0000) Subject: demand-attach-docs-20080314 X-Git-Tag: BP-openafs-windows-kdfs-ifs~12 X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=8cad4305acb787500f9434949cfb3c6500f3287b;p=packages%2Fo%2Fopenafs.git demand-attach-docs-20080314 LICENSE IPL10 Additional documentation for the demand attach file server, plus a state diagram. --- diff --git a/doc/arch/README b/doc/arch/README new file mode 100644 index 000000000..9ae5f010d --- /dev/null +++ b/doc/arch/README @@ -0,0 +1,7 @@ +dafs-fsa.dot is a description of the finite-state machine for volume +states in the Demand Attach fileserver in Dot (http://www.graphviz.org) +format. An invocation like: + + dot -Tsvg dafs-fsa.dot > dafs-fsa.svg + +will convert the description to an SVG file. diff --git a/doc/arch/dafs-fsa.dot b/doc/arch/dafs-fsa.dot new file mode 100644 index 000000000..565de7122 --- /dev/null +++ b/doc/arch/dafs-fsa.dot @@ -0,0 +1,109 @@ +# +# This is a dot (http://www.graphviz.org) description of the various +# states volumes can be in for DAFS (Demand Attach File Server). 
+# +# Author: Steven Jenkins +# Date: 2007-05-24 +# + +digraph VolumeStates { + size="11,17" + graph [ + rankdir = "TB" + ]; + + subgraph clusterKey { + rankdir="LR"; + shape = "rectangle"; + + s1 [ shape=plaintext, label = "VPut after VDetach in brown", + fontcolor="brown" ]; + s2 [ shape=plaintext, label = "VAttach in blue", + fontcolor="blue" ]; + s3 [ shape=plaintext, label = "VGet/VHold in purple", + fontcolor="purple" ]; + s4 [ shape=plaintext, label = "Error States in red", + fontcolor="red" ]; + s5 [ shape=plaintext, label = "VPut after VOffline in green", + fontcolor="green" ]; + s6 [ shape=ellipse, label = "re-entrant" ]; + s7 [ shape=ellipse, peripheries=2, label="non re-entrant" ]; + s8 [ shape=ellipse, color="red", label="Error States" ]; + + s6->s7->s8->s1->s2->s3->s4->s5 [style="invis"]; + + } + + node [ peripheries = "2" ] ATTACHING \ + LOADING_VNODE_BITMAPS HDR_LOADING_FROM_DISK \ + HDR_ATTACHING_LRU_PULL \ + "UPDATING\nSYNCING_VOL_HDR_TO_DISK" \ + OFFLINING DETACHING; + node [ shape = "ellipse", peripheries = "1" ]; + node [ color = "red" ] HARD_ERROR SALVAGE_REQUESTED SALVAGING; + + node [ color = "black" ]; // default back to black + + UNATTACHED->Exclusive_vol_op_executing [label = "controlled by FSSYNC" ]; + Exclusive_vol_op_executing->UNATTACHED [label = "controlled by FSSYNC" ]; + UNATTACHED->FREED [ label = "VCancelReservation_r() after a\nVDetach() or FreeVolume() will\ncause CheckDetach() or CheckFree() to fire" ]; + OFFLINING->UNATTACHED; + UNATTACHED->PREATTACHED [ color = "orange", label = "PreAttach()" ]; + PREATTACHED->UNATTACHED [ color = "orange", label = "VOffline()"]; + HARD_ERROR->PREATTACHED [ color = "orange", label = "operator intervention via FSSYNC" ]; + + PREATTACHED->Exclusive_vol_op_executing [color = "orange", label = "controlled by FSSYNC" ]; + Exclusive_vol_op_executing->PREATTACHED [color = "orange", label = "controlled by FSSYNC" ]; + PREATTACHED->FREED [ color = "orange", label = "VCancelReservation_r() after 
a\nVDetach() or FreeVolume() will\ncause CheckDetach() or CheckFree() to fire" ]; + PREATTACHED->ATTACHING [ color = "blue", weight = "8" ]; + SALVAGING->PREATTACHED [ label = "controlled via FSSYNC" ]; + + DETACHING->FREED ; + SHUTTING_DOWN->DETACHING [ color = "brown" ]; + ATTACHED_nUsers_GT_0->SHUTTING_DOWN [ color = "orange", label = "VDetach()" ]; + + DETACHING->"UPDATING\nSYNCING_VOL_HDR_TO_DISK" [ color = "brown" ]; + "UPDATING\nSYNCING_VOL_HDR_TO_DISK"->DETACHING [ color = "brown" ]; + OFFLINING->"UPDATING\nSYNCING_VOL_HDR_TO_DISK" [ color = "green" ]; + "UPDATING\nSYNCING_VOL_HDR_TO_DISK"->OFFLINING [ color = "green" ]; + GOING_OFFLINE->OFFLINING [ color = "green" ]; + + "UPDATING\nSYNCING_VOL_HDR_TO_DISK"->SALVAGE_REQUESTED [ color = "red" ]; + "UPDATING\nSYNCING_VOL_HDR_TO_DISK"->ATTACHING [ color = "blue" ]; + ATTACHING->"UPDATING\nSYNCING_VOL_HDR_TO_DISK" [ color = "blue" ]; + + ATTACHED_nUsers_GT_0->GOING_OFFLINE [ color = "orange", label = "VOffline" ]; + ATTACHED_nUsers_GT_0->ATTACHED_nUsers_EQ_0 [ color = "orange", label = "VPut" ]; + + ATTACHED_nUsers_GT_0->SALVAGE_REQUESTED [ color = "red" ]; + + LOADING_VNODE_BITMAPS->ATTACHING [ color = "blue" ]; + ATTACHING->LOADING_VNODE_BITMAPS [ color = "blue" ] ; + LOADING_VNODE_BITMAPS->SALVAGE_REQUESTED [ color = "red" ]; + HDR_LOADING_FROM_DISK->SALVAGE_REQUESTED [ color = "red" ]; + HDR_LOADING_FROM_DISK->ATTACHING [ color = "blue" ] ; + HDR_LOADING_FROM_DISK->ATTACHED_nUsers_GT_0 [ color = "purple" ]; + + SALVAGE_REQUESTED->SALVAGING [ label = "controlled via FSSYNC" ]; + SALVAGE_REQUESTED->HARD_ERROR [ color = "red", + label = "After hard salvage limit reached,\n hard error state is in effect\nuntil there is operator intervention" ]; + + HDR_ATTACHING_LRU_PULL->HDR_LOADING_FROM_DISK [ color = "blue" ]; + HDR_ATTACHING_LRU_PULL->HDR_LOADING_FROM_DISK [ color = "purple" ]; + HDR_ATTACHING_LRU_PULL->ATTACHED_nUsers_GT_0 [ color = "purple", label = "header can be in LRU\nand not have been reclaimed\nthus 
skipping disk I/O" ]; + + ATTACHING->HDR_ATTACHING_LRU_PULL [ color = "blue" ]; + ATTACHING->ATTACHED_nUsers_EQ_0 [ color = "blue" ]; + + ATTACHING->SALVAGE_REQUESTED [ color = "red" ]; + ATTACHED_nUsers_EQ_0->HDR_ATTACHING_LRU_PULL [ color = "purple" ]; + + ATTACHED_nUsers_EQ_0->SALVAGE_REQUESTED [ color = "red" ]; + + // Various loopback transitions + GOING_OFFLINE->GOING_OFFLINE [ label = "VPut when (nUsers > 1)" ]; + SHUTTING_DOWN->SHUTTING_DOWN + [ label = "VPut when ((nUsers > 1) ||\n((nUsers == 1) && (nWaiters > 0)))" ]; + SHUTTING_DOWN->SHUTTING_DOWN + [ label = "VCancelReservation_r when ((nWaiters > 1)\n|| ((nWaiters == 1) && (nUsers > 0)))"]; +} diff --git a/doc/man-pages/pod8/bos_create.pod b/doc/man-pages/pod8/bos_create.pod index 4c3cc5c12..1f1001ad0 100644 --- a/doc/man-pages/pod8/bos_create.pod +++ b/doc/man-pages/pod8/bos_create.pod @@ -45,8 +45,9 @@ fully-qualified or abbreviated unambiguously). For details, see L. Names the process to define and start. Any name is acceptable, but for the sake of simplicity it is best to use the last element of the process's -binary file pathname, and to use the same name on every server -machine. The conventional names, as used in all AFS documentation, are: +binary file pathname (or the instance type for B and B), and to +use the same name on every server machine. The conventional names, as used +in all AFS documentation, are: =over 4 @@ -54,6 +55,12 @@ machine. The conventional names, as used in all AFS documentation, are: The Backup Server process. +=item dafs + +The process that combines the Demand Attach File Server, Volume Server, +Salvageserver and Salvager processes (B, B, +B, and B). + =item fs The process that combines the File Server, Volume Server, and Salvager @@ -69,7 +76,7 @@ The Protection Server process. =item runntp -The controller process for the Network Time Protocol Daemon. +The controller process for the Network Time Protocol Daemon (obsolete). 
=item upclientbin @@ -113,11 +120,11 @@ command. =item dafs -Use this value only for the dafs process, which combines the -File Server, Volume Server, Salvageserver, and Salvager processes in -order to operate as a Demand Attach File Server. If one of the -component processes terminates, the BOS Server shuts down -and restarts the process in the appropriate order. +Use this value only for the dafs process, which combines the File Server, +Volume Server, Salvageserver, and Salvager processes in order to operate +as a Demand Attach File Server. If one of the component processes +terminates, the BOS Server shuts down and restarts the process in the +appropriate order. =item fs @@ -129,9 +136,9 @@ appropriate order. =item simple Use this value for all processes listed as acceptable values to the -B<-instance> argument, except for the B process. There are no -interdependencies between simple processes, so the BOS Server can stop and -start them independently as necessary. +B<-instance> argument, except for the B and B processes. +There are no interdependencies between simple processes, so the +BOS Server can stop and start them independently as necessary. =back @@ -258,6 +265,13 @@ C. Type the command on a single line. -cmd /usr/afs/bin/fileserver /usr/afs/bin/volserver \ /usr/afs/bin/salvager +The following command creates the dafs process dafs on the machine +C. Type the command on a single line. + + % bos create -server fs4.abc.com -instance dafs -type dafs \ + -cmd /usr/afs/bin/fileserver /usr/afs/bin/volserver \ + /usr/afs/bin/salvageserver /usr/afs/bin/salvager + The following command creates a cron process called C on the machine C, so that the BOS Server issues the indicated B command each day at 3:00 a.m. 
(the command creates a backup @@ -383,6 +397,7 @@ L, L, L, L, +L, L, L, L, diff --git a/doc/man-pages/pod8/fileserver.pod b/doc/man-pages/pod8/fileserver.pod index b3813eccc..19e5f5cd7 100644 --- a/doc/man-pages/pod8/fileserver.pod +++ b/doc/man-pages/pod8/fileserver.pod @@ -536,8 +536,8 @@ This option is only supported by the demand-attach file server. =item B<-vlruinterval > -The number of seconds between VLRU candidate queue scan default is 120 s. -The second s. +The number of seconds between VLRU candidate queue scans. The default is +120 seconds. This option is only supported by the demand-attach file server. diff --git a/doc/man-pages/pod8/salvageserver.pod b/doc/man-pages/pod8/salvageserver.pod new file mode 100644 index 000000000..26a1ee628 --- /dev/null +++ b/doc/man-pages/pod8/salvageserver.pod @@ -0,0 +1,328 @@ +=head1 NAME + +salvageserver - Initializes the Salvageserver component of the dafs process + +=head1 SYNOPSIS + +=for html +
+ +B [I] S<<< [B<-partition> >] >>> + S<<< [B<-volumeid> >] >>> [B<-debug>] [B<-nowrite>] + [B<-inodes>] [B<-force>] [B<-oktozap>] [B<-rootinodes>] + [B<-salvagedirs>] [B<-blockreads>] + S<<< [B<-parallel> >] >>> + S<<< [B<-tmpdir> >] >>> + [B<-showlog>] [B<-showsuid>] [B<-showmounts>] + S<<< [B<-orphans> (ignore | remove | attach)] >>> + [B<-client>] [B<-help>] + +=for html +
+ +=head1 DESCRIPTION + +In its typical mode of operation, the B<salvageserver> is a daemon process +responsible for salvaging volumes. It is a component of the C<dafs> +process type. In the conventional configuration, its binary file is +located in the F</usr/afs/bin> directory on a file server machine. + +The Salvageserver daemon is responsible for scheduling and executing +volume salvage operations on behalf of client processes. The fileserver +acts as the primary salvageserver client: any failed volume attach +operation results in a salvageserver scheduling request. The +salvageserver also accepts periodic volume activity messages in order to +update its salvage request priority queue. Other clients of the +salvageserver daemon include the B utility, and the +salvageserver command itself by passing the B<-client> flag. + +The salvage operations performed on vice partition data are nearly +identical to those performed by the standalone Salvager command. The +key differences between the two commands are: + +=over 4 + +=item * + +The Salvageserver is a daemon process which runs concurrently with the +fileserver. In contrast, the Salvager is a stand-alone application which +is invoked when the fileserver and volserver are not running. + +=item * + +The Salvageserver is incapable of performing whole partition salvage +operations; it operates at volume group granularity. + +=back + +The Salvageserver normally creates new inodes as it repairs damage. If the +partition is so full that there is no room for new inodes, use the +B<-nowrite> argument to bring undamaged volumes online without +attempting to salvage damaged volumes. Then use the B<vos move> command to +move one or more of the undamaged volumes to other partitions, freeing up +the space that the Salvageserver needs to create new inodes. + +By default, multiple Salvageserver subprocesses run in parallel: one for each +volume group. By default, four concurrent salvage operations are +permitted.
You may alter this default by providing a positive integer +value for the B<-parallel> argument. The maximum permitted value is 32 +concurrent salvageserver subprocesses. + +By default, the salvageserver enables a heuristic which attempts to stop +disk head thrashing by concurrent salvageserver subprocesses. Unfortunately, +this heuristic significantly degrades performance in many cases. In at least +the following environments, passing the C<all> string to the B<-parallel> +argument is strongly encouraged: + +=over 4 + +=item * + +On NAMEI fileservers + +=item * + +When a vice partition is backed by multiple disks (e.g. RAID) + +=item * + +When a vice partition is backed by SAN-attached storage, LVM, or some other +form of storage virtualization which would cause unix device id numbers to +be unpredictable. + +=back + +The Salvageserver creates temporary files as it runs, by default writing them +to the partition it is salvaging. The number of files can be quite large, +and if the partition is too full to accommodate them, the Salvageserver +terminates without completing the salvage operation (it always removes the +temporary files before exiting). Other Salvageserver subprocesses running at +the same time continue until they finish salvaging all other partitions +where there is enough disk space for temporary files. To complete the +interrupted salvage, reissue the command against the appropriate +partitions, adding the B<-tmpdir> argument to redirect the temporary files +to a local disk directory that has enough space. + +The B<-orphans> argument controls how the Salvageserver handles orphaned files +and directories that it finds on server partitions it is salvaging. An +I<orphaned> element is completely inaccessible because it is not +referenced by the vnode of any directory that can act as its parent (is +higher in the filespace). Orphaned objects occupy space on the server +partition, but do not count against the volume's quota.
+ +To generate a list of all mount points that reside in one or more volumes, +rather than actually salvaging them, include the B<-showmounts> flag. + +This command does not use the syntax conventions of the AFS command +suites. Provide the command name and all option names in full. + +=head1 OPTIONS + +=over 4 + +=item [I] + +Accommodates the command's use of the AFS command parser, and is optional. + +=item B<-partition> > + +Specifies the name of the partition to salvage. Specify the full partition +name using the form F> or F>. Omit this argument to +salvage every partition on the file server machine. + +=item B<-volumeid> > + +Specifies the volume ID of a specific read/write volume to salvage. The +B<-partition> argument must be provided along with this one and specify +the volume's actual site. + +=item B<-debug> + +This flag should be considered deprecated. Its primary purpose was to disable +forking and parallelization of the Salvager so that log messages were not +interleaved. Due to the manner in which F is +written, log messages from subprocesses are never interleaved; the entire log +for a volume group salvage is appended to the master log as one atomic +transaction. + +=item B<-nowrite> + +Brings all undamaged volumes online without attempting to salvage any +damaged volumes. + +=item B<-inodes> + +Records in the F file a list of all AFS inodes +that the Salvageserver modified. + +=item B<-force> + +Inspects all volumes for corruption, not just those that are marked as +having been active when a crash occurred. + +=item B<-oktozap> + +Removes a volume that is so damaged that even issuing the B +command with the B<-force> flag is ineffective. Use this argument only in +consultation with AFS Development or Product Support. Combine it with the +B<-partition> and B<-volumeid> arguments to identify the volume to remove. + +=item B<-rootinodes> + +Records in the F file a list of all AFS inodes +owned by the local superuser C. 
+ +=item B<-salvagedirs> + +Salvages entire directory structures, even if they do not appear to be +damaged. By default, the Salvageserver salvages a directory only if it is +flagged as corrupted. + +=item B<-blockreads> + +Forces the Salvageserver to read a partition one disk block (512 bytes) at a +time and to skip any blocks that are too badly damaged to be salvaged. +This allows it to salvage as many volumes as possible. By default, the +Salvageserver reads large disk blocks, which can cause it to exit prematurely +if it encounters disk errors. Use this flag if the partition to be +salvaged has disk errors. + +=item B<-parallel> > + +Specifies the maximum number of Salvageserver subprocesses to run in parallel. +Provide one of three values: + +=over 4 + +=item * + +An integer from the range C<1> to C<32>. A value of C<1> means that a +single Salvageserver subprocess salvages the volume groups sequentially. +The disk partition heuristic (see above) based upon unix device ids is +enabled. + +=item * + +The string C<all>, to run the default number of Salvageserver subprocesses in parallel on volume groups. The disk partition heuristic (see above) based upon unix device ids is +disabled. + +=item * + +The string C<all> followed immediately (with no intervening space) by an +integer from the range C<1> to C<32>, to run the specified number of +Salvageserver subprocesses in parallel on volume groups. The disk partition +heuristic (see above) based upon unix device ids is disabled. + +=back + +If this argument is omitted, up to four Salvageserver subprocesses run +in parallel. + +=item B<-tmpdir> > + +Names a local disk directory in which the Salvageserver places the temporary +files it creates during a salvage operation, instead of writing them to +the partition being salvaged (the default). If the Salvageserver cannot write +to the specified directory, it attempts to write to the partition being +salvaged. + +=item B<-showlog> + +Displays on the standard output stream all log data that is being written +to the F file.
+ +=item B<-showsuid> + +Displays a list of the pathnames for all files that have the setuid or +setgid mode bit set. + +=item B<-showmounts> + +Records in the F file all mount points found in +each volume. The Salvageserver does not repair corruption in the volumes, if +any exists. + +=item B<-orphans> (ignore | remove | attach) + +Controls how the Salvageserver handles orphaned files and directories. Choose +one of the following three values: + +=over 4 + +=item ignore + +Leaves the orphaned objects on the disk, but prints a message to the +F file reporting how many orphans were found and +the approximate number of kilobytes they are consuming. This is the +default if the B<-orphans> argument is omitted. + +=item remove + +Removes the orphaned objects, and prints a message to the +F file reporting how many orphans were removed +and the approximate number of kilobytes they were consuming. + +=item attach + +Attaches the orphaned objects by creating a reference to them in the vnode +of the volume's root directory. Since each object's actual name is now +lost, the Salvageserver assigns each one a name of the following form: + +=over 4 + +=item C<__ORPHANFILE__.I<index>> for files. + +=item C<__ORPHANDIR__.I<index>> for directories. + +=back + +where I<index> is a two-digit number that uniquely identifies each +object. The orphans are charged against the volume's quota and appear in +the output of the B<ls> command issued against the volume's root +directory. + +=back + +=item B<-client> + +The Salvageserver runs in client mode. The requested volume on the requested +partition will be scheduled for salvaging by the Salvageserver daemon. + +=item B<-help> + +Prints the online help for this command. All other valid options are +ignored. + +=back + +=head1 EXAMPLES + +The following command instructs the Salvageserver to schedule the salvage +of the volume with volume ID 258347486 on F</vicepg> on the local machine.
+ + % /usr/afs/bin/salvageserver -partition /vicepg -volumeid 258347486 -client + +=head1 PRIVILEGE REQUIRED + +To issue the command at the shell prompt, the issuer must be logged in as +the local superuser C. + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L + +=head1 COPYRIGHT + +IBM Corporation 2000. All Rights Reserved. +Sine Nomine Associates 2008. All Rights Reserved. + +This documentation is covered by the IBM Public License Version 1.0. It was +converted from HTML to POD by software written by Chas Williams and Russ +Allbery, based on work by Alf Wachsmann and Elizabeth Cassell. This document +was adapted from the Salvager POD documentation.