/* -*- Mode:Text -*- */
#ifndef lint
static char Rcs_Id[] =
    "$Id: lookup.c,v 1.29 91/09/11 23:22:51 geoff Exp $";
#endif

/*
 * lookup.c - see if a word appears in the dictionary
 *
 * Pace Willisson, 1983
 *
 * Copyright 1987, 1988, 1989, by Geoff Kuenning, Manhattan Beach, CA
 * Permission for non-profit use is hereby granted.
 * All other rights reserved.
 * See "version.h" for a more complete copyright notice.
 */

/*
 * $Log:	lookup.c,v $
 * Revision 1.29  91/09/11  23:22:51  geoff
 * Make sure to allocate enough space in the flag tables to allow for
 * string characters.  Make sure that "viazero" is set even if an affix
 * entry is empty.
 * 
 * Revision 1.28  91/07/11  19:52:17  geoff
 * Remove the include of stdio.h, since ispell.h now does this.
 * 
 * Revision 1.27  91/07/05  19:51:47  geoff
 * Fix some lint complaints.
 * 
 * Revision 1.26  91/05/27  21:48:03  geoff
 * Fix a bug in reading in null-affix prefix flags.  Add support for
 * reading the string character type table from the hash file.
 * 
 * Revision 1.25  91/02/26  23:24:04  geoff
 * If the options are no good, abort the run
 * 
 * Revision 1.24  90/12/31  00:59:22  geoff
 * Reformat to follow a consistent convention throughout ispell
 * 
 * Revision 1.23  90/04/26  22:44:11  geoff
 * Add the canonicalize parameter to the call to ichartostr.
 * 
 * Revision 1.22  90/04/17  15:33:08  geoff
 * Use explicit masking to prevent sign extension when checking the
 * magic number in the hash header.  Fix a place where the affix length
 * was being set to null when it should have been the affix pointer.
 * 
 * Revision 1.21  89/12/27  03:18:06  geoff
 * Move all messages to msgs.h so they can be reconfigured
 * 
 * Revision 1.20  89/06/09  15:55:12  geoff
 * Add support for the internal "character" type, ichar_t.
 * 
 * Revision 1.19  89/04/28  01:11:51  geoff
 * Change Header to Id;  nobody cares about my pathnames.
 * 
 * Revision 1.18  89/04/03  01:56:54  geoff
 * Fix a bunch of lint complaints.  Add support for keeping the compilation
 * options separate from the magic number.
 * 
 * Revision 1.17  88/12/26  02:30:25  geoff
 * Add a copyright notice.
 * 
 * Revision 1.16  88/11/16  02:19:31  geoff
 * Improve the quality of the error message issued when linit() fails to
 * initialize the hash table.
 * 
 * Revision 1.15  88/04/30  22:14:31  geoff
 * Fix some lint complaints.
 * 
 * Revision 1.14  88/03/27  01:01:44  geoff
 * Add a missing #ifdef on CAPITALIZATION
 * 
 * Revision 1.13  88/02/20  23:13:14  geoff
 * Many changes to support the new capitalization handling data structures.
 * 
 * Revision 1.12  87/09/30  23:31:04  geoff
 * Move some globals to ispell.h.
 * 
 * Revision 1.11  87/07/20  23:22:58  geoff
 * Enhance tinit to honor nodictflag and to read in the language tables.
 * 
 * Revision 1.10  87/04/19  22:53:08  geoff
 * Make hashheader a global, and rearrange the includes of ispell.h
 * and config.h.
 * 
 * Revision 1.9  87/03/26  00:30:40  geoff
 * Integrate Rich Salz's changes/improvements
 * 
 * Revision 1.8  87/03/10  23:33:03  geoff
 * Add code to exit gracefully if there isn't space for the hash table,
 * instead of just core dumping.
 * 
 * Revision 1.7  87/03/08  20:31:06  geoff
 * Accept an empty hash file as legal, and create a dummy null table
 * in that case.  Add the "dotree" argument to lookup.
 * 
 * Revision 1.6  87/03/05  22:56:02  geoff
 * Only set lastdent if something is found
 * 
 * Revision 1.5  87/03/01  00:56:59  geoff
 * Get rid of the cflag stuff,  it's now wrong.  Add code to convert all
 * the indexes in the hash table into pointers, so the loops don't have to.
 * Modify the rest of the code to expect pointers.  Be sure to set lastdent
 * if treelookup succeeds.
 * 
 * Revision 1.4  87/02/28  14:58:32  geoff
 * If lookup fails, try treelookup instead.  Also, if cflag is set, don't
 * read in the hash database.
 * 
 * Revision 1.3  87/02/26  00:26:59  geoff
 * Integrate McQueer's enhancements into the main branch
 * 
 * Revision 1.2  87/01/17  13:12:00  geoff
 * Add RCS ID keywords
 * 
 */

#include "config.h"
#include "ispell.h"
#include "msgs.h"


/* Fallback search routine defined elsewhere; lookup() calls it on a miss. */
struct dent *	treelookup ();

/*
 * Old-style declarations for the library routines used below.  They are
 * skipped whenever the "linux" macro is defined.
 */
#ifndef linux
extern char *	calloc ();
extern char *	malloc ();
extern long	lseek ();
#endif

/* Set to 1 once linit() has completed successfully; tested on entry. */
static int	inited = 0;

linit ()
    {
    int			hashfd;
    register int	i;
    register struct dent * dp;
    struct flagent *	entry;
    struct flagptr *	ind;
    int			nextchar;
    int			viazero;
    register ichar_t *	cp;

    if (inited)
	return 0;

    if ((hashfd = open (hashname, 0)) < 0)
	{
	(void) fprintf (stderr, CANT_OPEN, hashname);
	return (-1);
	}

    hashsize = read (hashfd, (char *) &hashheader, sizeof hashheader);
    if (hashsize < sizeof hashheader)
	{
	if (hashsize < 0)
	    (void) fprintf (stderr, LOOKUP_C_CANT_READ, hashname);
	else if (hashsize == 0)
	    (void) fprintf (stderr, LOOKUP_C_NULL_HASH, hashname);
	else
	    (void) fprintf (stderr,
	      LOOKUP_C_SHORT_HASH (hashname, hashsize, sizeof hashheader));
	return (-1);
	}
    else if ((hashheader.magic & 0xFFFFL) != (long) MAGIC)
	{
	(void) fprintf (stderr,
	  LOOKUP_C_BAD_MAGIC (hashname, MAGIC, hashheader.magic));
	return (-1);
	}
    else if (hashheader.compileoptions != COMPILEOPTIONS
      ||  hashheader.maxstringchars != MAXSTRINGCHARS
      ||  hashheader.maxstringcharlen != MAXSTRINGCHARLEN)
	{
	(void) fprintf (stderr,
	  LOOKUP_C_BAD_OPTIONS (hashheader.compileoptions,
	    hashheader.maxstringchars, hashheader.maxstringcharlen,
	  COMPILEOPTIONS, MAXSTRINGCHARS, MAXSTRINGCHARLEN));
	return (-1);
	}
    if (nodictflag)
	{
	/*
	 * Dictionary is not needed - create an empty dummy table.  We
	 * actually have to have one entry since the hash
	 * algorithm involves a divide by the table size
	 * (actually modulo, but zero is still unacceptable).
	 * So we create an empty entry.
	 */
	hashsize = 1;		/* This prevents divides by zero */
	hashtbl = (struct dent *) calloc (1, sizeof (struct dent));
	if (hashtbl == NULL)
	    {
	    (void) fprintf (stderr, LOOKUP_C_NO_HASH_SPACE);
	    return (-1);
	    }
	hashtbl[0].word = NULL;
	hashtbl[0].next = NULL;
	hashtbl[0].flagfield &= ~(USED | KEEP);
	/* The flag bits don't matter, but calloc cleared them. */
	hashstrings = (char *) malloc ((unsigned) hashheader.lstringsize);
	}
    else
	{
	hashtbl =
	 (struct dent *)
	    malloc ((unsigned) hashheader.tblsize * sizeof (struct dent));
	hashsize = hashheader.tblsize;
	hashstrings = (char *) malloc ((unsigned) hashheader.stringsize);
	}
    numsflags = hashheader.stblsize;
    numpflags = hashheader.ptblsize;
    sflaglist = (struct flagent *)
      malloc ((numsflags + numpflags) * sizeof (struct flagent));
    if (hashtbl == NULL  ||  hashstrings == NULL  ||  sflaglist == NULL)
	{
	(void) fprintf (stderr, LOOKUP_C_NO_HASH_SPACE);
	return (-1);
	}
    pflaglist = sflaglist + numsflags;

    if (nodictflag)
	{
	/*
	 * Read just the strings for the language table, and
	 * skip over the rest of the strings and all of the
	 * hash table.
	 */
	if (read (hashfd, hashstrings, (unsigned) hashheader.lstringsize)
	  != hashheader.lstringsize)
	    {
	    (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT);
	    return (-1);
	    }
	(void) lseek (hashfd,
	  (long) hashheader.stringsize - (long) hashheader.lstringsize
	    + (long) hashheader.tblsize * (long) sizeof (struct dent),
	  1);
	}
    else
	{
	if (read (hashfd, hashstrings, (unsigned) hashheader.stringsize)
	    != hashheader.stringsize
	  ||  read (hashfd, (char *) hashtbl,
	      (unsigned) hashheader.tblsize * sizeof (struct dent))
	    != hashheader.tblsize * sizeof (struct dent))
	    {
	    (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT);
	    return (-1);
	    }
	}
    if (read (hashfd, (char *) sflaglist,
	(unsigned) (numsflags + numpflags) * sizeof (struct flagent))
      != (numsflags + numpflags) * sizeof (struct flagent))
	{
	(void) fprintf (stderr, LOOKUP_C_BAD_FORMAT);
	return (-1);
	}
    (void) close (hashfd);

    if (!nodictflag)
	{
	for (i = hashsize, dp = hashtbl;  --i >= 0;  dp++)
	    {
	    if (dp->word == (char *) -1)
		dp->word = NULL;
	    else
		dp->word = &hashstrings [ (int)(dp->word) ];
	    if (dp->next == (struct dent *) -1)
		dp->next = NULL;
	    else
		dp->next = &hashtbl [ (int)(dp->next) ];
	    }
	}

    for (i = numsflags + numpflags, entry = sflaglist; --i >= 0; entry++)
	{
	if (entry->stripl)
	    entry->strip = (ichar_t *) &hashstrings[(int) entry->strip];
	else
	    entry->strip = NULL;
	if (entry->affl)
	    entry->affix = (ichar_t *) &hashstrings[(int) entry->affix];
	else
	    entry->affix = NULL;
	}
    /*
    ** Warning - 'entry' and 'i' are reset in the body of the loop
    ** below.  Don't try to optimize it by (e.g.) moving the decrement
    ** of i into the loop condition.
    */
    for (i = numsflags, entry = sflaglist;  i > 0;  i--, entry++)
	{
	if (entry->affl == 0)
	    {
	    cp = NULL;
	    ind = &sflagindex[0];
	    viazero = 1;
	    }
	else
	    {
	    cp = entry->affix + entry->affl - 1;
	    ind = &sflagindex[*cp];
	    viazero = 0;
	    while (ind->numents == 0  &&  ind->pu.ent != NULL)
		{
		if (cp == entry->affix)
		    {
		    ind = &ind->pu.fp[0];
		    viazero = 1;
		    }
		else
		    {
		    ind = &ind->pu.fp[*--cp];
		    viazero = 0;
		    }
		}
	    }
	if (ind->numents == 0)
	    ind->pu.ent = entry;
	ind->numents++;
	/*
	** If this index entry has more than MAXSEARCH flags in
	** it, we will split it into subentries to reduce the
	** searching.  However, the split doesn't make sense in
	** two cases:  (a) if we are already at the end of the
	** current affix, or (b) if all the entries in the list
	** have identical affixes.  Since the list is sorted, (b)
	** is true if the first and last affixes in the list
	** are identical.
	*/
	if (!viazero  &&  ind->numents >= MAXSEARCH
	  &&  icharcmp (entry->affix, ind->pu.ent->affix) != 0)
	    {
	    /* Sneaky trick:  back up and reprocess */
	    entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */
	    i = numsflags - (entry - sflaglist);
	    ind->pu.fp =
	      (struct flagptr *) calloc (SET_SIZE + MAXSTRINGCHARS,
	        sizeof (struct flagptr));
	    if (ind->pu.fp == NULL)
		{
		(void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE);
		return (-1);
		}
	    ind->numents = 0;
	    }
	}
    /*
    ** Warning - 'entry' and 'i' are reset in the body of the loop
    ** below.  Don't try to optimize it by (e.g.) moving the decrement
    ** of i into the loop condition.
    */
    for (i = numpflags, entry = pflaglist;  i > 0;  i--, entry++)
	{
	if (entry->affl == 0)
	    {
	    cp = NULL;
	    ind = &pflagindex[0];
	    viazero = 1;
	    }
	else
	    {
	    cp = entry->affix;
	    ind = &pflagindex[*cp];
	    while (ind->numents == 0  &&  ind->pu.ent != NULL)
		{
		if (*cp == 0)
		    {
		    ind = &ind->pu.fp[0];
		    viazero = 1;
		    }
		else
		    {
		    ind = &ind->pu.fp[*cp++];
		    viazero = 0;
		    }
		}
	    }
	if (ind->numents == 0)
	    ind->pu.ent = entry;
	ind->numents++;
	/*
	** If this index entry has more than MAXSEARCH flags in
	** it, we will split it into subentries to reduce the
	** searching.  However, the split doesn't make sense in
	** two cases:  (a) if we are already at the end of the
	** current affix, or (b) if all the entries in the list
	** have identical affixes.  Since the list is sorted, (b)
	** is true if the first and last affixes in the list
	** are identical.
	*/
	if (!viazero  &&  ind->numents >= MAXSEARCH
	  &&  icharcmp (entry->affix, ind->pu.ent->affix) != 0)
	    {
	    /* Sneaky trick:  back up and reprocess */
	    entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */
	    i = numpflags - (entry - pflaglist);
	    ind->pu.fp =
	      (struct flagptr *) calloc (SET_SIZE + MAXSTRINGCHARS,
	        sizeof (struct flagptr));
	    if (ind->pu.fp == NULL)
		{
		(void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE);
		return (-1);
		}
	    ind->numents = 0;
	    }
	}
#ifdef INDEXDUMP
    (void) fprintf (stderr, "Prefix index table:\n");
    dumpindex (pflagindex, 0);
    (void) fprintf (stderr, "Suffix index table:\n");
    dumpindex (sflagindex, 0);
#endif
    if (hashheader.nstrchartype == 0)
	chartypes = NULL;
    else
	{
	chartypes = (struct strchartype *)
	  malloc (hashheader.nstrchartype * sizeof (struct strchartype));
	if (chartypes == NULL)
	    {
	    (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE);
	    return (-1);
	    }
	for (i = 0, nextchar = hashheader.strtypestart;
	  i < hashheader.nstrchartype;
	  i++)
	    {
	    chartypes[i].name = &hashstrings[nextchar];
	    nextchar += strlen (chartypes[i].name) + 1;
	    chartypes[i].suffixes = &hashstrings[nextchar];
	    while (hashstrings[nextchar] != '\0')
		nextchar += strlen (&hashstrings[nextchar]) + 1;
	    nextchar++;
	    }
	}
    inited = 1;
    return (0);
    }

#ifdef INDEXDUMP
/*
 * dumpindex - debugging aid:  print one level of a flag index table on
 * stderr, recursing into any subtables.
 *
 * indexp points at the first of SET_SIZE index slots.  depth is the
 * nesting level, used as the number of blanks to indent each line.
 * A slot with numents == 0 and a non-null pointer is treated as a
 * subtable;  a slot with nonzero numents is treated as a list of flag
 * entries;  any other slot is skipped.  Entry numbers are printed
 * relative to sflaglist.
 *
 * Always returns 0.
 *
 * NOTE(review): only SET_SIZE slots are visited although linit()
 * allocates subtables of SET_SIZE + MAXSTRINGCHARS slots;  confirm
 * whether the string-character slots should be dumped as well.
 */
int dumpindex (indexp, depth)
    register struct flagptr *	indexp;
    register int		depth;
    {
    register int		i;
    int				j;
    int				issubtable;

    for (i = 0;  i < SET_SIZE;  i++, indexp++)
	{
	issubtable = indexp->numents == 0  &&  indexp->pu.fp != NULL;
	if (!issubtable  &&  indexp->numents == 0)
	    continue;			/* Empty slot - nothing to show */

	/* Common line prefix:  indentation, then the index character */
	for (j = depth;  --j >= 0;  )
	    (void) putc (' ', stderr);
	if (i >= ' '  &&  i < '~')
	    (void) putc (i, stderr);
	else
	    (void) fprintf (stderr, "0x%x", i);

	if (issubtable)
	    {
	    (void) putc ('\n', stderr);
	    (void) dumpindex (indexp->pu.fp, depth + 1);
	    }
	/*
	 * NOTE(review): strip and affix are ichar_t strings printed
	 * with %s;  confirm that this is valid for the ichar_t in use.
	 * The pointer differences are cast because %d expects an int.
	 */
	else if (indexp->pu.ent->stripl)
	    (void) fprintf (stderr, " -> entry %d (-%s,%s), %d entries\n",
	      (int) (indexp->pu.ent - sflaglist),
	      indexp->pu.ent->strip,
	      indexp->pu.ent->affl ? indexp->pu.ent->affix : "-",
	      indexp->numents);
	else
	    (void) fprintf (stderr, " -> entry %d (%s), %d entries\n",
	      (int) (indexp->pu.ent - sflaglist),
	      indexp->pu.ent->affix, indexp->numents);
	}
    return 0;
    }
#endif

/*
 * lookup - see whether a word is in the in-memory hash table.
 *
 * s is the word to find, as an ichar_t string.  It is hashed with
 * hash() to pick the starting bucket, and converted to an ordinary
 * string with ichartostr() (third argument 1, the "canonicalize"
 * parameter per the revision log) for comparison with the stored words.
 *
 * dotree, if nonzero, makes a failed hash-table search fall back to
 * treelookup().
 *
 * Returns a pointer to the matching entry, the result of treelookup()
 * when the fallback is used, or NULL if the word is not found.
 *
 * NOTE(review): ichartostr() is given no length limit here;  presumably
 * callers guarantee that s fits in INPUTWORDLEN + MAXAFFIXLEN bytes
 * once converted - confirm.
 */
struct dent * lookup (s, dotree)
    register ichar_t *		s;
    int				dotree;
    {
    register struct dent *	dp;
    register char *		s1;
    char			schar[INPUTWORDLEN + MAXAFFIXLEN];

    dp = &hashtbl[hash (s, hashsize)];
    ichartostr (schar, s, 1);
    for (  ;  dp != NULL;  dp = dp->next)
	{
	/* quick strcmp, but only for equality */
	s1 = dp->word;
	if (s1  &&  s1[0] == schar[0]  &&  strcmp (s1 + 1, schar + 1) == 0)
	    return dp;
#ifdef CAPITALIZATION
	/*
	 * Skip variations.  The next-pointer test keeps a chain whose
	 * final entry still has MOREVARIANTS set from running off the
	 * end of the list.
	 */
	while ((dp->flagfield & MOREVARIANTS)  &&  dp->next != NULL)
	    dp = dp->next;
#endif
	}
    if (dotree)
	return treelookup (s);
    return NULL;
    }
