From: Steven Jenkins <steven.jenkins@gmail.com>
Date: Fri, 14 Mar 2008 18:05:00 +0000 (+0000)
Subject: demand-attach-docs-20080314
X-Git-Tag: BP-openafs-windows-kdfs-ifs~12
X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=8cad4305acb787500f9434949cfb3c6500f3287b;p=packages%2Fo%2Fopenafs.git

demand-attach-docs-20080314

LICENSE IPL10

Additional documentation for the demand attach file server, plus a state
diagram.
---

diff --git a/doc/arch/README b/doc/arch/README
new file mode 100644
index 000000000..9ae5f010d
--- /dev/null
+++ b/doc/arch/README
@@ -0,0 +1,7 @@
+dafs-fsa.dot is a description of the finite-state machine for volume
+states in the Demand Attach fileserver in Dot (http://www.graphviz.org)
+format.  An invocation like:
+
+    dot -Tsvg dafs-fsa.dot > dafs-fsa.svg
+
+will convert the description to an SVG file.
diff --git a/doc/arch/dafs-fsa.dot b/doc/arch/dafs-fsa.dot
new file mode 100644
index 000000000..565de7122
--- /dev/null
+++ b/doc/arch/dafs-fsa.dot
@@ -0,0 +1,109 @@
+#
+# This is a dot (http://www.graphviz.org) description of the various
+# states volumes can be in for DAFS (Demand Attach File Server).
+#
+# Author: Steven Jenkins
+# Date: 2007-05-24
+#
+
+digraph VolumeStates {
+	size="11,17"
+	graph [
+		rankdir = "TB"
+	];
+
+	subgraph clusterKey {
+		rankdir="LR";
+		shape = "rectangle";
+
+	s1 [ shape=plaintext, label = "VPut after VDetach in brown",
+			fontcolor="brown" ];
+	s2 [ shape=plaintext, label = "VAttach in blue",
+			fontcolor="blue" ];
+	s3 [ shape=plaintext, label = "VGet/VHold in purple",
+			fontcolor="purple" ];
+	s4 [ shape=plaintext, label = "Error States in red",
+			fontcolor="red" ];
+	s5 [ shape=plaintext, label = "VPut after VOffline in green",
+			fontcolor="green" ];
+	s6 [ shape=ellipse, label = "re-entrant" ];
+	s7 [ shape=ellipse, peripheries=2, label="non re-entrant" ];
+	s8 [ shape=ellipse, color="red", label="Error States" ];
+
+	s6->s7->s8->s1->s2->s3->s4->s5 [style="invis"];
+
+	}
+
+	node [ peripheries = "2" ] ATTACHING \
+		LOADING_VNODE_BITMAPS HDR_LOADING_FROM_DISK \
+		HDR_ATTACHING_LRU_PULL \
+		"UPDATING\nSYNCING_VOL_HDR_TO_DISK" \
+		OFFLINING DETACHING;
+	node [ shape = "ellipse", peripheries = "1" ];
+	node [ color = "red" ] HARD_ERROR SALVAGE_REQUESTED SALVAGING;
+
+	node [ color = "black" ]; // default back to black
+
+	UNATTACHED->Exclusive_vol_op_executing [label = "controlled by FSSYNC" ];
+	Exclusive_vol_op_executing->UNATTACHED  [label = "controlled by FSSYNC" ];
+	UNATTACHED->FREED [ label = "VCancelReservation_r() after a\nVDetach() or FreeVolume() will\ncause CheckDetach() or CheckFree() to fire" ];
+	OFFLINING->UNATTACHED;
+	UNATTACHED->PREATTACHED [ color = "orange", label = "PreAttach()" ];
+	PREATTACHED->UNATTACHED [ color = "orange", label = "VOffline()"];
+	HARD_ERROR->PREATTACHED [ color = "orange", label = "operator intervention via FSSYNC" ];
+
+	PREATTACHED->Exclusive_vol_op_executing [color = "orange", label = "controlled by FSSYNC" ];
+	Exclusive_vol_op_executing->PREATTACHED [color = "orange", label = "controlled by FSSYNC" ];
+	PREATTACHED->FREED [ color = "orange", label = "VCancelReservation_r() after a\nVDetach() or FreeVolume() will\ncause CheckDetach() or CheckFree() to fire" ];
+	PREATTACHED->ATTACHING [ color = "blue", weight = "8" ];
+	SALVAGING->PREATTACHED [ label = "controlled via FSSYNC" ];	
+
+	DETACHING->FREED ;
+	SHUTTING_DOWN->DETACHING [ color = "brown" ];
+	ATTACHED_nUsers_GT_0->SHUTTING_DOWN [ color = "orange", label = "VDetach()" ];
+
+	DETACHING->"UPDATING\nSYNCING_VOL_HDR_TO_DISK" [ color = "brown" ];
+	"UPDATING\nSYNCING_VOL_HDR_TO_DISK"->DETACHING [ color = "brown" ];
+	OFFLINING->"UPDATING\nSYNCING_VOL_HDR_TO_DISK" [ color = "green" ];
+	"UPDATING\nSYNCING_VOL_HDR_TO_DISK"->OFFLINING [ color = "green" ];
+	GOING_OFFLINE->OFFLINING [ color = "green" ];
+
+	"UPDATING\nSYNCING_VOL_HDR_TO_DISK"->SALVAGE_REQUESTED [ color = "red" ];	
+	"UPDATING\nSYNCING_VOL_HDR_TO_DISK"->ATTACHING [ color = "blue" ];	
+	ATTACHING->"UPDATING\nSYNCING_VOL_HDR_TO_DISK" [ color = "blue" ];
+
+	ATTACHED_nUsers_GT_0->GOING_OFFLINE [ color = "orange", label = "VOffline" ];	
+	ATTACHED_nUsers_GT_0->ATTACHED_nUsers_EQ_0 [ color = "orange", label = "VPut" ];	
+
+	ATTACHED_nUsers_GT_0->SALVAGE_REQUESTED [ color = "red" ];
+
+	LOADING_VNODE_BITMAPS->ATTACHING [ color = "blue" ];
+	ATTACHING->LOADING_VNODE_BITMAPS [ color = "blue" ] ;
+	LOADING_VNODE_BITMAPS->SALVAGE_REQUESTED [ color = "red" ];
+	HDR_LOADING_FROM_DISK->SALVAGE_REQUESTED [ color = "red" ];
+	HDR_LOADING_FROM_DISK->ATTACHING [ color = "blue" ] ;
+	HDR_LOADING_FROM_DISK->ATTACHED_nUsers_GT_0 [ color = "purple" ];
+
+	SALVAGE_REQUESTED->SALVAGING [ label = "controlled via FSSYNC" ];
+	SALVAGE_REQUESTED->HARD_ERROR [ color = "red", 
+		label = "After hard salvage limit reached,\n hard error state is in effect\nuntil there is operator intervention" ];
+	
+	HDR_ATTACHING_LRU_PULL->HDR_LOADING_FROM_DISK [ color = "blue" ];
+	HDR_ATTACHING_LRU_PULL->HDR_LOADING_FROM_DISK [ color = "purple" ];
+	HDR_ATTACHING_LRU_PULL->ATTACHED_nUsers_GT_0 [ color = "purple", label = "header can be in LRU\nand not have been reclaimed\nthus skipping disk I/O" ];
+
+	ATTACHING->HDR_ATTACHING_LRU_PULL [ color = "blue" ];
+	ATTACHING->ATTACHED_nUsers_EQ_0 [ color = "blue" ];
+
+	ATTACHING->SALVAGE_REQUESTED [ color = "red" ];
+	ATTACHED_nUsers_EQ_0->HDR_ATTACHING_LRU_PULL [ color = "purple" ];
+	
+	ATTACHED_nUsers_EQ_0->SALVAGE_REQUESTED [ color = "red" ];
+
+	// Various loopback transitions
+	GOING_OFFLINE->GOING_OFFLINE [ label = "VPut when (nUsers > 1)" ];
+	SHUTTING_DOWN->SHUTTING_DOWN 
+		[ label = "VPut when ((nUsers > 1) ||\n((nUsers == 1) && (nWaiters > 0)))" ];
+	SHUTTING_DOWN->SHUTTING_DOWN
+		[ label = "VCancelReservation_r when ((nWaiters > 1)\n|| ((nWaiters == 1) && (nUsers > 0)))"];	
+} 
diff --git a/doc/man-pages/pod8/bos_create.pod b/doc/man-pages/pod8/bos_create.pod
index 4c3cc5c12..1f1001ad0 100644
--- a/doc/man-pages/pod8/bos_create.pod
+++ b/doc/man-pages/pod8/bos_create.pod
@@ -45,8 +45,9 @@ fully-qualified or abbreviated unambiguously). For details, see L<bos(8)>.
 
 Names the process to define and start. Any name is acceptable, but for the
 sake of simplicity it is best to use the last element of the process's
-binary file pathname, and to use the same name on every server
-machine. The conventional names, as used in all AFS documentation, are:
+binary file pathname (or the instance type for B<fs> and B<dafs>), and to
+use the same name on every server machine. The conventional names, as used
+in all AFS documentation, are:
 
 =over 4
 
@@ -54,6 +55,12 @@ machine. The conventional names, as used in all AFS documentation, are:
 
 The Backup Server process.
 
+=item dafs
+
+The process that combines the Demand Attach File Server, Volume Server, 
+Salvageserver and Salvager processes (B<fileserver>, B<volserver>, 
+B<salvageserver>, and B<salvager>).
+
 =item fs
 
 The process that combines the File Server, Volume Server, and Salvager
@@ -69,7 +76,7 @@ The Protection Server process.
 
 =item runntp
 
-The controller process for the Network Time Protocol Daemon.
+The controller process for the Network Time Protocol Daemon (obsolete).
 
 =item upclientbin
 
@@ -113,11 +120,11 @@ command.
 
 =item dafs
 
-Use this value only for the dafs process, which combines the
-File Server, Volume Server, Salvageserver, and Salvager processes in
-order to operate as a Demand Attach File Server.  If one of the
-component processes terminates, the BOS Server shuts down
-and restarts the process in the appropriate order.
+Use this value only for the dafs process, which combines the File Server,
+Volume Server, Salvageserver, and Salvager processes in order to operate
+as a Demand Attach File Server.  If one of the component processes
+terminates, the BOS Server shuts down and restarts the process in the
+appropriate order.
 
 =item fs
 
@@ -129,9 +136,9 @@ appropriate order.
 =item simple
 
 Use this value for all processes listed as acceptable values to the
-B<-instance> argument, except for the B<fs> process.  There are no
-interdependencies between simple processes, so the BOS Server can stop and
-start them independently as necessary.
+B<-instance> argument, except for the B<fs> and B<dafs> processes.  
+There are no interdependencies between simple processes, so the 
+BOS Server can stop and start them independently as necessary.
 
 =back
 
@@ -258,6 +265,13 @@ C<fs4.abc.com>. Type the command on a single line.
                 -cmd /usr/afs/bin/fileserver /usr/afs/bin/volserver \
                 /usr/afs/bin/salvager
 
+The following command creates the dafs process dafs on the machine
+C<fs4.abc.com>. Type the command on a single line.
+
+   % bos create -server fs4.abc.com -instance dafs -type dafs \
+                -cmd /usr/afs/bin/fileserver /usr/afs/bin/volserver \
+                /usr/afs/bin/salvageserver /usr/afs/bin/salvager
+
 The following command creates a cron process called C<userbackup> on the
 machine C<fs5.abc.com>, so that the BOS Server issues the indicated B<vos
 backupsys> command each day at 3:00 a.m. (the command creates a backup
@@ -383,6 +397,7 @@ L<fileserver(8)>,
 L<kaserver(8)>,
 L<ptserver(8)>,
 L<salvager(8)>,
+L<salvageserver(8)>,
 L<upclient(8)>,
 L<upserver(8)>,
 L<vlserver(8)>,
diff --git a/doc/man-pages/pod8/fileserver.pod b/doc/man-pages/pod8/fileserver.pod
index b3813eccc..19e5f5cd7 100644
--- a/doc/man-pages/pod8/fileserver.pod
+++ b/doc/man-pages/pod8/fileserver.pod
@@ -536,8 +536,8 @@ This option is only supported by the demand-attach file server.
 
 =item B<-vlruinterval <I<seconds>>
 
-The number of seconds between VLRU candidate queue scan default is 120  s.
-The second                                                              s.
+The number of seconds between VLRU candidate queue scans.  The default is
+120 seconds.
 
 This option is only supported by the demand-attach file server.
 
diff --git a/doc/man-pages/pod8/salvageserver.pod b/doc/man-pages/pod8/salvageserver.pod
new file mode 100644
index 000000000..26a1ee628
--- /dev/null
+++ b/doc/man-pages/pod8/salvageserver.pod
@@ -0,0 +1,328 @@
+=head1 NAME
+
+salvageserver - Initializes the Salvageserver component of the dafs process
+
+=head1 SYNOPSIS
+
+=for html
+<div class="synopsis">
+
+B<salvageserver> [I<initcmd>] S<<< [B<-partition> <I<name of partition to salvage>>] >>>
+    S<<< [B<-volumeid> <I<volume id to salvage>>] >>> [B<-debug>] [B<-nowrite>]
+    [B<-inodes>] [B<-force>] [B<-oktozap>] [B<-rootinodes>]
+    [B<-salvagedirs>] [B<-blockreads>]
+    S<<< [B<-parallel> <I<# of max parallel partition salvaging>>] >>>
+    S<<< [B<-tmpdir> <I<name of dir to place tmp files>>] >>>
+    [B<-showlog>] [B<-showsuid>] [B<-showmounts>]
+    S<<< [B<-orphans> (ignore | remove | attach)] >>>
+    [B<-client>] [B<-help>]
+
+=for html
+</div>
+
+=head1 DESCRIPTION
+
+In its typical mode of operation, the B<salvageserver> is a daemon process 
+responsible for salvaging volumes.  It is a component of the C<dafs> 
+process type.  In the conventional configuration, its binary file is 
+located in the F</usr/afs/bin> directory on a file server machine.
+
+The Salvageserver daemon is responsible for scheduling and executing 
+volume salvage operations on behalf of client processes.  The fileserver 
+acts as the primary salvageserver client: any failed volume attach 
+operation results in a salvageserver scheduling request.  The 
+salvageserver also accepts periodic volume activity messages in order to 
+update its salvage request priority queue.  Other clients of the 
+salvageserver daemon include the B<salvsync-debug> utility and the
+salvageserver command itself, when it is invoked with the B<-client> flag.
+
+The salvage operations performed on vice partition data are nearly 
+identical to those performed by the standalone Salvager command.  The 
+key differences between the two commands are:
+
+=over 4
+
+=item *
+
+The Salvageserver is a daemon process which runs concurrently with the 
+fileserver.  In contrast, the Salvager is a stand-alone application which 
+is invoked when the fileserver and volserver are not running.
+
+=item *
+
+The Salvageserver is incapable of performing whole partition salvage 
+operations; it operates at volume group granularity.
+
+=back
+
+The Salvageserver normally creates new inodes as it repairs damage. If the
+partition is so full that there is no room for new inodes, use the
+B<-nowrite> argument to bring undamaged volumes online without
+attempting to salvage damaged volumes. Then use the B<vos move> command to
+move one or more of the undamaged volumes to other partitions, freeing up
+the space that the Salvageserver needs to create new inodes.
+
+By default, multiple Salvageserver subprocesses run in parallel: one for each 
+volume group.  By default, four concurrent salvage operations are 
+permitted.  You may alter this default by providing a positive integer 
+value for the B<-parallel> argument.  The maximum permitted value is 32 
+concurrent salvageserver subprocesses.
+
+By default, the salvageserver enables a heuristic which attempts to stop
+disk head thrashing by concurrent salvageserver subprocesses.  Unfortunately,
+this heuristic significantly degrades performance in many cases.  In at least 
+the following environments, passing the C<all> string to the B<-parallel> 
+argument is strongly encouraged:
+
+=over 4
+
+=item *
+
+On NAMEI fileservers
+
+=item *
+
+When a vice partition is backed by multiple disks (e.g. RAID)
+
+=item *
+
+When a vice partition is backed by SAN-attached storage, LVM, or some other
+form of storage virtualization which would cause unix device id numbers to
+be unpredictable.
+
+=back
+
+The Salvageserver creates temporary files as it runs, by default writing them
+to the partition it is salvaging. The number of files can be quite large,
+and if the partition is too full to accommodate them, the Salvageserver
+terminates without completing the salvage operation (it always removes the
+temporary files before exiting). Other Salvageserver subprocesses running at
+the same time continue until they finish salvaging all other partitions
+where there is enough disk space for temporary files. To complete the
+interrupted salvage, reissue the command against the appropriate
+partitions, adding the B<-tmpdir> argument to redirect the temporary files
+to a local disk directory that has enough space.
+
+The B<-orphans> argument controls how the Salvageserver handles orphaned files
+and directories that it finds on server partitions it is salvaging. An
+I<orphaned> element is completely inaccessible because it is not
+referenced by the vnode of any directory that can act as its parent (is
+higher in the filespace). Orphaned objects occupy space on the server
+partition, but do not count against the volume's quota.
+
+To generate a list of all mount points that reside in one or more volumes,
+rather than actually salvaging them, include the B<-showmounts> flag.
+
+This command does not use the syntax conventions of the AFS command
+suites. Provide the command name and all option names in full.
+
+=head1 OPTIONS
+
+=over 4
+
+=item [I<initcmd>]
+
+Accommodates the command's use of the AFS command parser, and is optional.
+
+=item B<-partition> <I<name of partition to salvage>>
+
+Specifies the name of the partition to salvage. Specify the full partition
+name using the form F</vicepI<x>> or F</vicepI<xx>>. Omit this argument to
+salvage every partition on the file server machine.
+
+=item B<-volumeid> <I<volume id to salvage>>
+
+Specifies the volume ID of a specific read/write volume to salvage.  The
+B<-partition> argument must be provided along with this one and specify
+the volume's actual site.
+
+=item B<-debug>
+
+This flag should be considered deprecated.  Its primary purpose was to disable
+forking and parallelization of the Salvager so that log messages were not
+interleaved.  Due to the manner in which F</usr/afs/logs/SalSrvLog> is 
+written, log messages from subprocesses are never interleaved; the entire log
+for a volume group salvage is appended to the master log as one atomic 
+transaction.
+
+=item B<-nowrite>
+
+Brings all undamaged volumes online without attempting to salvage any
+damaged volumes.
+
+=item B<-inodes>
+
+Records in the F</usr/afs/logs/SalSrvLog> file a list of all AFS inodes
+that the Salvageserver modified.
+
+=item B<-force>
+
+Inspects all volumes for corruption, not just those that are marked as
+having been active when a crash occurred.
+
+=item B<-oktozap>
+
+Removes a volume that is so damaged that even issuing the B<vos zap>
+command with the B<-force> flag is ineffective. Use this argument only in
+consultation with AFS Development or Product Support. Combine it with the
+B<-partition> and B<-volumeid> arguments to identify the volume to remove.
+
+=item B<-rootinodes>
+
+Records in the F</usr/afs/logs/SalSrvLog> file a list of all AFS inodes
+owned by the local superuser C<root>.
+
+=item B<-salvagedirs>
+
+Salvages entire directory structures, even if they do not appear to be
+damaged. By default, the Salvageserver salvages a directory only if it is
+flagged as corrupted.
+
+=item B<-blockreads>
+
+Forces the Salvageserver to read a partition one disk block (512 bytes) at a
+time and to skip any blocks that are too badly damaged to be salvaged.
+This allows it to salvage as many volumes as possible. By default, the
+Salvageserver reads large disk blocks, which can cause it to exit prematurely
+if it encounters disk errors. Use this flag if the partition to be
+salvaged has disk errors.
+
+=item B<-parallel> <I<# of max parallel partition salvaging>>
+
+Specifies the maximum number of Salvageserver subprocesses to run in parallel.
+Provide one of three values:
+
+=over 4
+
+=item *
+
+An integer from the range C<1> to C<32>. A value of C<1> means that a
+single Salvageserver subprocess salvages the volume groups sequentially.
+The disk partition heuristic (see above) based upon unix device ids is 
+enabled.
+
+=item *
+
+The string C<all> alone, to run the default number of subprocesses.  The disk
+partition heuristic (see above) based upon unix device ids is disabled.
+
+=item *
+
+The string C<all> followed immediately (with no intervening space) by an
+integer from the range C<1> to C<32>, to run the specified number of
+Salvageserver subprocesses in parallel on volume groups.  The disk partition 
+heuristic (see above) based upon unix device ids is disabled.
+
+=back
+
+If this argument is omitted, up to four Salvageserver subprocesses run
+in parallel.
+
+=item B<-tmpdir> <I<name of dir to place tmp files>>
+
+Names a local disk directory in which the Salvageserver places the temporary
+files it creates during a salvage operation, instead of writing them to
+the partition being salvaged (the default). If the Salvageserver cannot write
+to the specified directory, it attempts to write to the partition being
+salvaged.
+
+=item B<-showlog>
+
+Displays on the standard output stream all log data that is being written
+to the F</usr/afs/logs/SalSrvLog> file.
+
+=item B<-showsuid>
+
+Displays a list of the pathnames for all files that have the setuid or
+setgid mode bit set.
+
+=item B<-showmounts>
+
+Records in the F</usr/afs/logs/SalSrvLog> file all mount points found in
+each volume. The Salvageserver does not repair corruption in the volumes, if
+any exists.
+
+=item B<-orphans> (ignore | remove | attach)
+
+Controls how the Salvageserver handles orphaned files and directories.  Choose
+one of the following three values:
+
+=over 4
+
+=item ignore
+
+Leaves the orphaned objects on the disk, but prints a message to the
+F</usr/afs/logs/SalSrvLog> file reporting how many orphans were found and
+the approximate number of kilobytes they are consuming. This is the
+default if the B<-orphans> argument is omitted.
+
+=item remove
+
+Removes the orphaned objects, and prints a message to the
+F</usr/afs/logs/SalSrvLog> file reporting how many orphans were removed
+and the approximate number of kilobytes they were consuming.
+
+=item attach
+
+Attaches the orphaned objects by creating a reference to them in the vnode
+of the volume's root directory. Since each object's actual name is now
+lost, the Salvageserver assigns each one a name of the following form:
+
+=over 4
+
+=item C<__ORPHANFILE__.I<index>> for files.
+
+=item C<__ORPHANDIR__.I<index>> for directories.
+
+=back
+
+where I<index> is a two-digit number that uniquely identifies each
+object. The orphans are charged against the volume's quota and appear in
+the output of the B<ls> command issued against the volume's root
+directory.
+
+=back
+
+=item B<-client>
+
+Salvageserver runs in client mode.  The requested volume on the requested
+partition will be scheduled for salvaging by the Salvageserver daemon.
+
+=item B<-help>
+
+Prints the online help for this command. All other valid options are
+ignored.
+
+=back
+
+=head1 EXAMPLES
+
+The following command instructs the Salvageserver to schedule the salvage 
+of the volume with volume ID 258347486 on F</vicepg> on the local machine.
+
+   % /usr/afs/bin/salvageserver -partition /vicepg -volumeid 258347486 -client
+
+=head1 PRIVILEGE REQUIRED
+
+To issue the command at the shell prompt, the issuer must be logged in as
+the local superuser C<root>.
+
+=head1 SEE ALSO
+
+L<BosConfig(5)>,
+L<SalvageLog(5)>,
+L<salvager(8)>,
+L<bos_create(8)>,
+L<bos_getlog(8)>,
+L<bos_salvage(8)>,
+L<vos_move(1)>
+
+=head1 COPYRIGHT
+
+IBM Corporation 2000. <http://www.ibm.com/> All Rights Reserved.
+Sine Nomine Associates 2008.  All Rights Reserved.
+
+This documentation is covered by the IBM Public License Version 1.0.  It was
+converted from HTML to POD by software written by Chas Williams and Russ
+Allbery, based on work by Alf Wachsmann and Elizabeth Cassell.  This document
+was adapted from the Salvager POD documentation.