#!/bin/perl
#
# NAME:
#	cmt2doc - extract documentation from source
#
# SYNOPSIS:
#	cmt2doc [-pamit][-e "oext"][-S "secn"][-D "secd"][-O "org"]
#		[-L "lang"][-C "cmt"][-E "ecmt"] "file"
#
# DESCRIPTION:
#	This 'Perl' script extracts documentation from comments	in
#	source files.  It allows manual pages to be written in ``plain
#	text'' in source files where they are most likely to be updated
#	when the source code is.
#
#	'cmt2doc' extracts the documentation as either clean text or as
#	input suitable for 'troff'(1) and friends.   The results in
#	either case are usually quite adequate.  Try the following
#	commands:
#.nf
#
#		'perl cmt2doc.pl -p cmt2doc.pl | more'
#		'perl cmt2doc.pl -pm cmt2doc.pl | nroff -man | more'
#.fi
#
#	'cmt2doc' can usually work out for itself how to extract the
#	text from a comment.  It looks for the regular expression 
#	'.* NAME:$' which it treats as the start of a manual page, and
#	uses what ever is found before 'NAME' as the characters to
#	remove from the start of each line. 
#
#	Typographical conventions:
#.nf
#		Words like \'this word\' will be type-set in 'bold'.
#		Words like \"this word\" will be set in "italics".
#		Words like ``this quote'' will not be touched.
#.fi
#
#	It is possible to put 'troff' commands at the start of an
#	otherwise blank line.  Indeed they are sometimes needed such as
#	when setting out examples.  They will be stripped if not
#	generating 'troff' output.  
#
#	'cmt2doc' understands the format required for most manual page
#	sections and attempts to set them appropriately.
#
# OPTIONS:
#	-p	print to stdout.  By default documentation for
#		"file" will be printed to a file in the current
#		directory of the same name but with an extention
#		that represents the format (.doc,.man,.tex). 
#
#	-a	print all documentation, not just the top level.
#
#	-m	Output for 'troff -man'.
#
#	-i	Output for texinfo (no yet implemented).
#
#	-t	``Plain text'' strip single quotes.  Leave double quotes
#		alone though.
#
#	-e "oext"
#		Use "oext" as the extension for the output file.
#
#	-S "secn"
#		Tell [nt]roff which section the man page belongs in.
#		Default is 'L'.
#
#	-L "lang"
#		Select default values for "cmt" and "ecmt" based on
#		"lang" ('c','c++','lisp').  Most shell like languages
#		such as 'perl' and 'sh' are easily handled by the
#		defaults.
#
#	-D "secd"
#		Use "secd" as the section description.
#
#	-C "cmt"
#		Assume the comment lines start with "cmt".  Otherwise
#		we attempt to work it out either based on the file
#		extention (.c,.h,.cc etc) or from the comment itself.
#
#	-E "ecmt"
#		The comment ends when we see "ecmt" otherwise the first
#		line that does not start with "cmt".
#
#	-O "org"
#		Use "org" as the organization identifier (printed bottom
#		left of each page).
#
#	Some options only apply to certain output modes.
#
# FILES:
#	/usr/bin/perl		The perl interpreter.  This entry
#				is really just to show how 'cmt2doc'
#				handles the 'FILES' section.
#	/local/bin/cmt2doc.pl	This script. "ditto".
#
# BUGS:
#	It probably does not handle nested quotes correctly.
#	Lines starting with a \'.\' are in trouble.
#	For good results it is hard to avoid using 'troff' commands,
#	particularly '.nf' and '.fi'.
#	
#	Handling of '.TH' seems to vary with different man macro sets.
#	You may have to hack 'man_init' to get good results.
#

#
# RCSid:
#	$Id: cmt2doc.pl,v 1.7 1992/06/05 12:39:07 sjg Exp $
#
#	@(#)Copyright (c) 1992, Simon J. Gerraty
#
#	This file is provided in the hope that it will
#	be of use.  There is absolutely NO WARRANTY.
#	Permission to copy, redistribute or otherwise
#	use this file is hereby granted provided that 
#	the above copyright notice and this notice are
#	left intact. 
#      
#	Please send copies of changes and bug-fixes to:
#	sjg@zen.void.oz.au
#

$Myname=$0;
$Myname=~ s#^.*/([^/]*)$#$1#;

# some defaults
$do_init='txt_init';
$do_fini='noop';
$do_sec='txt_sec';
$do_para='noop';
$do_line='txt_line';

$man_secn='1';			# local commands
$oext='.doc';
$Debug = 0;
$start_para='';
$indent=0;
$defPD='.8v';

$date=&get_date;
$org='Iona Technologies';		# be sure to set this!

require 'getopts.pl';
do Getopts('dpamite:L:S:C:E:D:O:');

$org=$opt_O if defined($opt_O);
$cmt=$opt_C if defined($opt_C);
$ecmt=$opt_E if defined($opt_E);
# redefine the necessary functions
if (defined($opt_m)) {	# [tn]roff -man
  $oext = '.man';
  $do_init='man_init';
  $do_para='man_para';
  $do_sec='man_sec';
  $do_line='man_line';
} elsif (defined($opt_i)) {	# texinfo
  $oext = '.tex';
  $do_init='texi_init';
  $do_fini='texi_fini';
  $do_sec='texi_sec';
  $do_line='texi_line';
}
$man_secn=$opt_S if defined($opt_S);
if (defined($opt_D)) {
  $man_secd=$opt_D;
} else {
  $man_secd=&lookup_mansec($man_secn);
}
$oext=$opt_e if defined($opt_e);
$Debug = 1 if defined($opt_d);
$Lang=$opt_L if defined($opt_L);


$indoc=0;
$in_para = 0;

FILE: foreach $file (@ARGV) {
  print STDERR "doing $file\n" if $Debug > 0;
  $name="./$file";
  $name=~s#^.*/([^/]*)$#$1#;
  $ext=$name;
  $ext=~s/.*(\.[^.]*)$/\1/;

  if (!defined($opt_L)) {
    $Lang='c' if ($ext =~ m/\.[ch]$/);
    $Lang='c++' if ($ext =~ m/\.(cc|C|H)$/);
    $Lang='lisp' if ($ext =~ m/\.el$/);
  }
  if (defined($Lang)) {
    if ($Lang eq 'c') {
      $cmt = '[/ ]\*';
      $ecmt = ' *\*/';
    } elsif ($Lang eq 'c++') {
      $cmt = '(//|[/ ]\*)';
      $ecmt = ' *\*/';
    } elsif ($Lang eq 'lisp') {
      $cmt = ';+';
    }
  }
  if (!defined($opt_p)) {
    $ofile = $name;		# we've already stripped dirname
    $ofile =~ s#\.[^/.]+$##;
    $ofile .= $oext;
    print STDERR "Output to $ofile\n" if $Debug > 0;
    open(STDOUT, "> $ofile") || die "can't redirect STDOUT: $!\n";
  }
  if (!open(F, "< $file")) {
    print STDERR "can't open $file: $!\n";
    next FILE;
  }
  LINE: while (<F>) {
    chop;
    if ($indoc == 0 && m/ ?NAME:$/) {
      if (!defined($cmt)) {
	$cmt = $_;
	$cmt =~ s/^(.*) NAME.*/\1/;
      }
      $indoc = 1;
      $in_para = 0;
      &$do_init;
    }
    next if ($indoc == 0);
    # we are inside doc section
    if ($_ !~ m@^$cmt@ || (defined($ecmt) && $_ =~ m@^$ecmt@)) {
      $indoc = 0;
      &$do_fini;
      if (defined($opt_a)) {
	next LINE;
      } else {
	next FILE;
      }
    }
    s@^$cmt ?@@;
    $needout = 1;
    if (m/^[A-Z][A-Za-z _-]+:$/) {
      &$do_sec;
    } elsif (m/^[ \t]*$/) {
      $in_para = 0;
      if (defined($opt_m)) {
	$needout = 0;
      }
    } else {
      if ($in_para == 0) {
	$in_para = 1;
	&$do_para;
      }
      &$do_line;
    }
    print "$_\n" if ($needout > 0);
  }
  close F;
}
exit 0;

# for plain text these are noops
sub noop {
}

sub txt_init {
  local($i,$c);
  $llength = 65;
  $c = 0;
  
  $nm=$name;
  $nm=~s/\.[^.]*$//;
  $nm =~ tr/[a-z]/[A-Z]/;
  $nm = "$nm($man_secn)";
  print "\n$nm";
  $c += length($nm);
  $i = int(($llength - length($man_secd))/ 2);
  while ($c < $i) {
    $c++;
    print " ";
  }
  print "$man_secd";
  $c += length($man_secd);
  $i = $llength - length($nm);
  while ($c < $i) {
    $c++;
    print " ";
  }
  print "$nm\n\n\n";
}

sub txt_sec {
  # just loose the trailing ':'
  $sec = $_;
  $sec =~ s/ *([A-Z][A-Za-z _-]*):/\1/;
  $_ = $sec;
  $in_para = 0;
}

sub txt_line {
  $needout = 0 if (m/^\.\w+/);	# strip nroff commands
  if (defined($opt_t)) {
    # strip 'word' to just word.
    # a bit of trickery to avoid ``quotes'' and \'word\'.
    s/^'([^']+)'/\1/g;	# 'bold'
    s/([^'\\])'([^']+)'/\1\2/g;	# 'bold'
  }
  s/\\(['"\\])/\1/g;		# strip \\ \' and \" to ' " and \
}


sub man_init {
  print ".\\\" extracted from $file $date by $Myname\n";
  
  $nm=$name;
  $nm=~s/\.[^.]*$//;
  $nm =~ tr/[a-z]/[A-Z]/;
  # some tmac.an macros don't support $org HP-UX for example.
  # But most do.  Just comment out setting of $org above.
  if (defined($org)) {
    print ".TH $nm $man_secn \"$date\" \"$org\" \"$man_secd\"\n";
  } else {
    print ".TH $nm $man_secn \"$date\" \"$man_secd\"\n";
  }
  # just to be sure
  print ".PD $defPD\n";
}

sub man_sec {
  &man_indent(0);		# make sure indentation is back to 0

  if ($start_para eq '.nf') {
    print ".fi\n";
  }
  if ($sec eq 'FILES') {
    # previous section was FILES
    # restore inter-paragraph distance
    print ".PD $defPD\n";
  }
  # get new section name.
  $sec = $_;
  $sec =~ s/ *([A-Z][A-Za-z _-]*):/\1/;

  if ($sec ne 'NAME') {
    print "\n";
  }
  if ($sec =~ m/ /) {
    print ".SH \"$sec\"\n";
  } else {
    print ".SH $sec\n";
  }
  if ($sec eq 'FILES') {
    # little or no gap between paragraphs.
    # so it looks like it should.
    print ".PD .1v\n";
  }
  $needout = 0;
  $in_para = 0;
}

# this gets a little messy
sub man_para {
  if (m/^\.\w+/) {
    # a [tn]roff command, next line is start of para
    $in_para=0;
    return;
  }
  if ($sec =~ m/DESCRIPTION|OPTIONS/ && m/^[ \t]*-/) {
    $start_para = '.TP';
  } elsif ($sec eq 'FILES') {
    $start_para = '.TP 30';
  } elsif ($sec =~ m/NAME|SYNOPSIS/) {
    $start_para = '.nf';
  } elsif ($start_para =~ m/\.TP/) {
    $start_para = '.PP';
  } else {
    $start_para = '';
  }
  print "$start_para\n" if ($needout > 0);
  # handle indented paras
  if ($start_para !~ m/\.TP/ && m/^\t/) {
    &man_indent(-1);
  }
}


# we have to do more that we would like here, to
# set 'bold' and "italics" but not to harm \'words\'
# \"words\" and ``quotes''.
sub man_line {
  # man_para will have been called once already
  # so first time in after a new para, $in_para==1.
  # in here we can set it to other values to indicate
  # a need to force a new para, or adjust indentation.
  if ($in_para == 3) {
    &man_indent(-1);
    $in_para=1;
  }
  if (m/^\.\w+/) {
    # a [tn]roff command
    $in_para=3;
    return;
  }
  s/^[ \t]*//;
  if ($sec eq 'FILES') {
    # we assume file descriptions are formated
    # filename\tdecription
    if (m/^[^\t]+\t+[^\t]+/) {
      if ($in_para == 2) {
        &man_para;
      }
      $in_para = 2;
      s/^[ \t]*//;
      s/^([^\t]+)\t+([^\t]+)/\1\n\2/;
    }
  } elsif ($sec =~ m/DESCRIPTION|OPTIONS/ && m/^[ \t]*-/) {
    if ($in_para == 2) {
      &man_para;
    }
    $in_para = 2;
    s/^[ \t]*//;
    s/\t/ /g;
    # format options correctly
    s/^([^'" ]+)/'\1'/ if (m/^[^'"]/);
    s/^('[^']+' *"[^"]+") *([^'" ])/\1\n\2/;
    s/^('[^']+') *([^'" ])/\1\n\2/;
  }
  s/\t/ /g;
  if ($sec eq 'SYNOPSIS') {
    s/^(\w+)/'\1'/;
    s/(-\w+)/'\1'/g if (m/\[/);
  }
  s/([ '"])-/\1\\-/g;
  s/^"([^"]+)"/\\fI\1\\fR/g;	# "italic"
  # avoid \"word\"
  s/([^\\])"([^"]*[^\\])"/\1\\fI\2\\fR/g;	# "italic"
  # a bit of trickery to avoid ``quotes'' and \'word\'.
  s/^'([^']+)'/\\fB\1\\fR/g;	# 'bold'
  s/([^'\\])'([^']+)'/\1\\fB\2\\fR/g;	# 'bold'
  # now make \['"] into just ' or "
  s/\\(['"])/\1/g;
}

# adjust the indent level
sub man_indent {
  local($i) = @_;
  local($itabs,@tabs);
  
  if ($i < 0) {
    # calculate required indent level
    $itabs=$_;
    $itabs =~ s/^(\t+)[^\t].*/\1/;

    @tabs=split(/\t/,$itabs, 10);
    $i = $#tabs - 1;
  }
  if ($i >= 0) {
    while ($indent < $i) {
      $indent++;
      print ".RS\n";
    }
    while ($indent > $i) {
      $indent--;
      print ".RE\n";
    }
  }
}


sub lookup_mansec {
  local($n) = @_;
  local($d);
  %s = &init_secd if (!defined(%s));

  $d = $s{$n};
  if (!defined($d)) {
    $d = $s{'default'};
  }
  $d;
}

sub init_secd {
  local(%s);

  $s{'default'} = 'MISC. REFERENCE MANUAL PAGES';
  $s{'1'} = 'USER COMMANDS ';
  $s{'1C'} = 'USER COMMANDS';
  $s{'1G'} = 'USER COMMANDS';
  $s{'1S'} = 'USER COMMANDS';
  $s{'1V'} = 'USER COMMANDS ';
  $s{'2'} = 'SYSTEM CALLS';
  $s{'2V'} = 'SYSTEM CALLS';
  $s{'3'} = 'C LIBRARY FUNCTIONS';
  $s{'3C'} = 'COMPATIBILITY FUNCTIONS';
  $s{'3F'} = 'FORTRAN LIBRARY ROUTINES';
  $s{'3K'} = 'KERNEL VM LIBRARY FUNCTIONS';
  $s{'3L'} = 'LIGHTWEIGHT PROCESSES LIBRARY';
  $s{'3M'} = 'MATHEMATICAL LIBRARY';
  $s{'3N'} = 'NETWORK FUNCTIONS';
  $s{'3R'} = 'RPC SERVICES LIBRARY';
  $s{'3S'} = 'STANDARD I/O FUNCTIONS';
  $s{'3V'} = 'C LIBRARY FUNCTIONS';
  $s{'3X'} = 'MISCELLANEOUS LIBRARY FUNCTIONS';
  $s{'4'} = 'DEVICES AND NETWORK INTERFACES';
  $s{'4F'} = 'PROTOCOL FAMILIES';
  $s{'4I'} = 'DEVICES AND NETWORK INTERFACES';
  $s{'4M'} = 'DEVICES AND NETWORK INTERFACES';
  $s{'4N'} = 'DEVICES AND NETWORK INTERFACES';
  $s{'4P'} = 'PROTOCOLS';
  $s{'4S'} = 'DEVICES AND NETWORK INTERFACES';
  $s{'4V'} = 'DEVICES AND NETWORK INTERFACES';
  $s{'5'} = 'FILE FORMATS';
  $s{'5V'} = 'FILE FORMATS';
  $s{'6'} = 'GAMES AND DEMOS';
  $s{'7'} = 'ENVIRONMENTS, TABLES, AND TROFF MACROS';
  $s{'7V'} = 'ENVIRONMENTS, TABLES, AND TROFF MACROS';
  $s{'8'} = 'MAINTENANCE COMMANDS';
  $s{'8C'} = 'MAINTENANCE COMMANDS';
  $s{'8S'} = 'MAINTENANCE COMMANDS';
  $s{'8V'} = 'MAINTENANCE COMMANDS';
  $s{'L'} = 'LOCAL COMMANDS';
  %s;
}

sub get_date {
  @months = ('January','February','March','April','May',
	     'June','July','August','September','October',
	     'November','December');
  ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$idst) =
    localtime(time);
  if ($year < 70) {
    $cent='20';
  } else {
    $cent = '19';
  }
  $month = $months[$mon];
  "$mday $month $cent$year";
}
