/*************************************************************************************************
 * The command line utility of the q-gram database API
 *                                                               Copyright (C) 2007-2010 FAL Labs
 * This file is part of Tokyo Dystopia.
 * Tokyo Dystopia is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License or any later version.  Tokyo Dystopia is distributed in the hope
 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 * License for more details.
 * You should have received a copy of the GNU Lesser General Public License along with Tokyo
 * Dystopia; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307 USA.
 *************************************************************************************************/


#include <tcqdb.h>
#include "myconf.h"

#define SEARCHWORDMAX  256               // maximum number of search words; the search command ignores any beyond this
#define DEFSEARCHMAX   10                // default maximum number of printed IDs, used when `-max' is not given

enum {                                   // enumeration for expression mode (how results of several words are combined)
  EMUNION,                               // union, selected by `-eu'
  EMISECT,                               // intersection, the default of the search command
  EMDIFF                                 // difference, selected by `-ed'
};


/* global variables; both are assigned once at the start of main */
const char *g_progname;                  // program name (argv[0]), printed in usage and error messages
int g_dbgfd;                             // debugging output: number parsed from the TCDBGFD environment variable, -1 if unset


/* function prototypes
   Each `run*' function parses the command line of one subcommand and calls the `proc*' function
   of the same name, which performs the operation and returns the exit status. */
int main(int argc, char **argv);
static void usage(void);
static void printerr(TCQDB *qdb);
static char *mygetline(FILE *ifp);
static bool mysynccb(int total, int current, const char *msg, void *opq);
static int runcreate(int argc, char **argv);
static int runinform(int argc, char **argv);
static int runput(int argc, char **argv);
static int runout(int argc, char **argv);
static int runsearch(int argc, char **argv);
static int runoptimize(int argc, char **argv);
static int runimporttsv(int argc, char **argv);
static int runnormalize(int argc, char **argv);
static int runversion(int argc, char **argv);
static int proccreate(const char *path, int64_t etnum, int opts);
static int procinform(const char *path, int omode);
static int procput(const char *path, int64_t id, const char *text, int omode, int topts);
static int procout(const char *path, int64_t id, const char *text, int omode, int topts);
static int procsearch(const char *path, const char **words, int wnum, int omode,
                      int topts, int emode, int smode, int max, bool ph);
static int procoptimize(const char *path, int omode);
static int procimporttsv(const char *path, const char *file,
                         int64_t icsiz, int omode, int topts);
static int procnormalize(const char *text, int topts);
static int procversion(void);


/* main routine */
int main(int argc, char **argv){
  g_progname = argv[0];
  g_dbgfd = -1;
  const char *ebuf = getenv("TCDBGFD");
  if(ebuf) g_dbgfd = tcatoi(ebuf);
  if(argc < 2) usage();
  int rv = 0;
  if(!strcmp(argv[1], "create")){
    rv = runcreate(argc, argv);
  } else if(!strcmp(argv[1], "inform")){
    rv = runinform(argc, argv);
  } else if(!strcmp(argv[1], "put")){
    rv = runput(argc, argv);
  } else if(!strcmp(argv[1], "out")){
    rv = runout(argc, argv);
  } else if(!strcmp(argv[1], "search")){
    rv = runsearch(argc, argv);
  } else if(!strcmp(argv[1], "optimize")){
    rv = runoptimize(argc, argv);
  } else if(!strcmp(argv[1], "importtsv")){
    rv = runimporttsv(argc, argv);
  } else if(!strcmp(argv[1], "normalize")){
    rv = runnormalize(argc, argv);
  } else if(!strcmp(argv[1], "version") || !strcmp(argv[1], "--version")){
    rv = runversion(argc, argv);
  } else {
    usage();
  }
  return rv;
}


/* print the usage to the standard error and exit with status 1 */
static void usage(void){
  static const char *const synopses[] = {
    "create [-tl] [-td|-tb|-tt] path [etnum]",
    "inform [-nl|-nb] path",
    "put [-nl|-nb] [-rc] [-ra] [-rs] path id text",
    "out [-nl|-nb] [-rc] [-ra] [-rs] path id text",
    "search [-nl|-nb] [-rc] [-ra] [-rs] [-eu|-ed] [-sp|-ss|-sf]"
    " [-max num] [-ph] path [word...]",
    "optimize [-nl|-nb] path",
    "importtsv [-ic num] [-nl|-nb] [-rc] [-ra] [-rs] path [file]",
    "normalize [-rc] [-ra] [-rs] text",
    "version"
  };
  fprintf(stderr, "%s: the command line utility of the q-gram database API\n", g_progname);
  fputs("\n", stderr);
  fputs("usage:\n", stderr);
  // every synopsis line is the program name followed by one subcommand form
  for(size_t i = 0; i < sizeof(synopses) / sizeof(synopses[0]); i++){
    fprintf(stderr, "  %s %s\n", g_progname, synopses[i]);
  }
  fputs("\n", stderr);
  exit(1);
}


/* print error information of a database object to the standard error
   The line holds the program name, the database path ("-" if the object has none), the last
   error code and its message. */
static void printerr(TCQDB *qdb){
  const char *path = tcqdbpath(qdb);
  if(!path) path = "-";
  int ecode = tcqdbecode(qdb);
  const char *emsg = tcqdberrmsg(ecode);
  fprintf(stderr, "%s: %s: %d: %s\n", g_progname, path, ecode, emsg);
}


/* read a line from a stream
   `ifp' specifies the input stream.
   The return value is the line without its terminator, or NULL if the end of the stream was
   reached before any byte could be read.  Because the region of the return value is allocated
   with `tcmalloc', it should be released with `tcfree' when it is no longer in use.
   NUL bytes in the input are dropped.  Both '\n' and '\r' end a line, so a CR-LF pair yields an
   extra empty line. */
static char *mygetline(FILE *ifp){
  int len = 0;
  int blen = 1024;
  // always keep one byte beyond `blen' so that the final terminator below stays in bounds
  char *buf = tcmalloc(blen + 1);
  bool end = true;
  int c;
  while((c = fgetc(ifp)) != EOF){
    end = false;
    if(c == '\0') continue;
    if(blen <= len){
      blen *= 2;
      buf = tcrealloc(buf, blen + 1);
    }
    if(c == '\n' || c == '\r') c = '\0';
    buf[len++] = c;
    if(c == '\0') break;
  }
  if(end){
    tcfree(buf);
    return NULL;
  }
  buf[len] = '\0';
  return buf;
}


/* callback function for sync progression
   Prints every step when `total' is below 10, otherwise only the steps where `current' is a
   multiple of a tenth of `total'.  `opq' is not used.  The return value is always true. */
static bool mysynccb(int total, int current, const char *msg, void *opq){
  bool report = true;
  if(total >= 10){
    int step = total / 10;               // at least 1 here, so the modulo is well defined
    report = (current % step == 0);
  }
  if(report) printf("[sync:%d:%d:%s]\n", total, current, msg);
  return true;
}


/* parse arguments of create command
   Options are recognized only before the path.  The optional second operand is parsed as a
   decimal number and passed on as `etnum'; -1 is passed when it is absent. */
static int runcreate(int argc, char **argv){
  char *path = NULL;
  char *etstr = NULL;
  int opts = 0;
  for(int i = 2; i < argc; i++){
    char *arg = argv[i];
    if(path){
      // everything after the path is an operand; only one more is accepted
      if(etstr) usage();
      etstr = arg;
      continue;
    }
    if(arg[0] != '-'){
      path = arg;
      continue;
    }
    if(!strcmp(arg, "-tl")){
      opts |= QDBTLARGE;
    } else if(!strcmp(arg, "-td")){
      opts |= QDBTDEFLATE;
    } else if(!strcmp(arg, "-tb")){
      opts |= QDBTBZIP;
    } else if(!strcmp(arg, "-tt")){
      opts |= QDBTTCBS;
    } else {
      usage();
    }
  }
  if(!path) usage();
  int64_t etnum = -1;
  if(etstr) etnum = strtoll(etstr, NULL, 10);
  return proccreate(path, etnum, opts);
}


/* parse arguments of inform command
   Accepts the locking options followed by exactly one path. */
static int runinform(int argc, char **argv){
  char *path = NULL;
  int omode = 0;
  for(int i = 2; i < argc; i++){
    char *arg = argv[i];
    if(path) usage();                    // no operand is allowed after the path
    if(arg[0] != '-'){
      path = arg;
      continue;
    }
    if(!strcmp(arg, "-nl")){
      omode |= QDBONOLCK;
    } else if(!strcmp(arg, "-nb")){
      omode |= QDBOLCKNB;
    } else {
      usage();
    }
  }
  if(!path) usage();
  return procinform(path, omode);
}


/* parse arguments of put command
   Options are recognized only before the path, which must be followed by the ID and the text.
   The ID is parsed as a decimal number and must be positive.  Each `-r*' option switches off
   one of the text normalization flags, all of which are on by default. */
static int runput(int argc, char **argv){
  char *path = NULL;
  char *idstr = NULL;
  char *text = NULL;
  int omode = 0;
  int topts = TCTNLOWER | TCTNNOACC | TCTNSPACE;
  for(int i = 2; i < argc; i++){
    char *arg = argv[i];
    if(path){
      // operands after the path are taken in order: ID, then text
      if(!idstr){
        idstr = arg;
      } else if(!text){
        text = arg;
      } else {
        usage();
      }
      continue;
    }
    if(arg[0] != '-'){
      path = arg;
      continue;
    }
    if(!strcmp(arg, "-nl")){
      omode |= QDBONOLCK;
    } else if(!strcmp(arg, "-nb")){
      omode |= QDBOLCKNB;
    } else if(!strcmp(arg, "-rc")){
      topts &= ~TCTNLOWER;
    } else if(!strcmp(arg, "-ra")){
      topts &= ~TCTNNOACC;
    } else if(!strcmp(arg, "-rs")){
      topts &= ~TCTNSPACE;
    } else {
      usage();
    }
  }
  if(!(path && idstr && text)) usage();
  int64_t id = strtoll(idstr, NULL, 10);
  if(id < 1) usage();
  return procput(path, id, text, omode, topts);
}


/* parse arguments of out command
   Takes the same options and operands as the put command: options only before the path, then
   the ID (a positive decimal number) and the text. */
static int runout(int argc, char **argv){
  char *path = NULL;
  char *idstr = NULL;
  char *text = NULL;
  int omode = 0;
  int topts = TCTNLOWER | TCTNNOACC | TCTNSPACE;
  for(int i = 2; i < argc; i++){
    char *arg = argv[i];
    if(path){
      // operands after the path are taken in order: ID, then text
      if(!idstr){
        idstr = arg;
      } else if(!text){
        text = arg;
      } else {
        usage();
      }
      continue;
    }
    if(arg[0] != '-'){
      path = arg;
      continue;
    }
    if(!strcmp(arg, "-nl")){
      omode |= QDBONOLCK;
    } else if(!strcmp(arg, "-nb")){
      omode |= QDBOLCKNB;
    } else if(!strcmp(arg, "-rc")){
      topts &= ~TCTNLOWER;
    } else if(!strcmp(arg, "-ra")){
      topts &= ~TCTNNOACC;
    } else if(!strcmp(arg, "-rs")){
      topts &= ~TCTNSPACE;
    } else {
      usage();
    }
  }
  if(!(path && idstr && text)) usage();
  int64_t id = strtoll(idstr, NULL, 10);
  if(id < 1) usage();
  return procout(path, id, text, omode, topts);
}


/* parse arguments of search command
   Options are recognized only before the path; every argument after the path is a search word.
   At most SEARCHWORDMAX words are kept and further ones are silently ignored.  A negative
   `-max' value removes the limit on printed IDs. */
static int runsearch(int argc, char **argv){
  char *path = NULL;
  char *words[SEARCHWORDMAX];
  int wnum = 0;
  int omode = 0;
  int topts = TCTNLOWER | TCTNNOACC | TCTNSPACE;
  int emode = EMISECT;
  int smode = QDBSSUBSTR;
  int max = DEFSEARCHMAX;
  bool ph = false;
  for(int i = 2; i < argc; i++){
    const char *arg = argv[i];
    if(path){
      if(wnum < SEARCHWORDMAX) words[wnum++] = argv[i];
      continue;
    }
    if(arg[0] != '-'){
      path = argv[i];
      continue;
    }
    if(!strcmp(arg, "-nl")){
      omode |= QDBONOLCK;
    } else if(!strcmp(arg, "-nb")){
      omode |= QDBOLCKNB;
    } else if(!strcmp(arg, "-rc")){
      topts &= ~TCTNLOWER;
    } else if(!strcmp(arg, "-ra")){
      topts &= ~TCTNNOACC;
    } else if(!strcmp(arg, "-rs")){
      topts &= ~TCTNSPACE;
    } else if(!strcmp(arg, "-eu")){
      emode = EMUNION;
    } else if(!strcmp(arg, "-ed")){
      emode = EMDIFF;
    } else if(!strcmp(arg, "-sp")){
      smode = QDBSPREFIX;
    } else if(!strcmp(arg, "-ss")){
      smode = QDBSSUFFIX;
    } else if(!strcmp(arg, "-sf")){
      smode = QDBSFULL;
    } else if(!strcmp(arg, "-max")){
      // the limit is the next argument, which must exist
      i++;
      if(i >= argc) usage();
      max = tcatoi(argv[i]);
    } else if(!strcmp(arg, "-ph")){
      ph = true;
    } else {
      usage();
    }
  }
  if(!path || wnum < 1) usage();
  if(max < 0) max = INT_MAX;
  return procsearch(path, (const char **)words, wnum, omode, topts, emode, smode, max, ph);
}


/* parse arguments of optimize command
   Accepts the locking options followed by exactly one path. */
static int runoptimize(int argc, char **argv){
  char *path = NULL;
  int omode = 0;
  for(int i = 2; i < argc; i++){
    char *arg = argv[i];
    if(path) usage();                    // no operand is allowed after the path
    if(arg[0] != '-'){
      path = arg;
      continue;
    }
    if(!strcmp(arg, "-nl")){
      omode |= QDBONOLCK;
    } else if(!strcmp(arg, "-nb")){
      omode |= QDBOLCKNB;
    } else {
      usage();
    }
  }
  if(!path) usage();
  return procoptimize(path, omode);
}


/* parse arguments of importtsv command */
static int runimporttsv(int argc, char **argv){
  char *path = NULL;
  char *file = NULL;
  int64_t icsiz = 0;
  int omode = 0;
  int topts = TCTNLOWER | TCTNNOACC | TCTNSPACE;
  for(int i = 2; i < argc; i++){
    if(!path && argv[i][0] == '-'){
      if(!strcmp(argv[i], "-ic")){
        if(++i >= argc) usage();
        icsiz = tcatoix(argv[i]);
      } else if(!strcmp(argv[i], "-nl")){
        omode |= HDBONOLCK;
      } else if(!strcmp(argv[i], "-nb")){
        omode |= HDBOLCKNB;
      } else if(!strcmp(argv[i], "-rc")){
        topts &= ~TCTNLOWER;
      } else if(!strcmp(argv[i], "-ra")){
        topts &= ~TCTNNOACC;
      } else if(!strcmp(argv[i], "-rs")){
        topts &= ~TCTNSPACE;
      } else {
        usage();
      }
    } else if(!path){
      path = argv[i];
    } else if(!file){
      file = argv[i];
    } else {
      usage();
    }
  }
  if(!path) usage();
  int rv = procimporttsv(path, file, icsiz, omode, topts);
  return rv;
}


/* parse arguments of normalize command */
static int runnormalize(int argc, char **argv){
  char *path = NULL;
  char *text = NULL;
  int topts = TCTNLOWER | TCTNNOACC | TCTNSPACE;
  for(int i = 2; i < argc; i++){
    if(!path && argv[i][0] == '-'){
      if(!strcmp(argv[i], "-rc")){
        topts &= ~TCTNLOWER;
      } else if(!strcmp(argv[i], "-ra")){
        topts &= ~TCTNNOACC;
      } else if(!strcmp(argv[i], "-rs")){
        topts &= ~TCTNSPACE;
      } else {
        usage();
      }
    } else if(!text){
      text = argv[i];
    } else {
      usage();
    }
  }
  if(!text) usage();
  int rv = procnormalize(text, topts);
  return rv;
}


/* parse arguments of version command
   The command takes no arguments; whatever was given is ignored. */
static int runversion(int argc, char **argv){
  (void)argc;
  (void)argv;
  return procversion();
}


/* perform create command
   Tunes a new database object with `etnum' and `topts', opens `path' as a writer with creation
   and truncation, and closes it again.  The first step that fails is reported and ends the
   sequence.  The return value is 0 on success or 1 on failure. */
static int proccreate(const char *path, int64_t etnum, int topts){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  int rv = 0;
  // short-circuit evaluation stops at the first failing step
  if(!tcqdbtune(qdb, etnum, topts) ||
     !tcqdbopen(qdb, path, QDBOWRITER | QDBOCREAT | QDBOTRUNC) ||
     !tcqdbclose(qdb)){
    printerr(qdb);
    rv = 1;
  }
  tcqdbdel(qdb);
  return rv;
}


/* perform inform command */
static int procinform(const char *path, int omode){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  if(!tcqdbopen(qdb, path, QDBOREADER | omode)){
    printerr(qdb);
    tcqdbdel(qdb);
    return 1;
  }
  bool err = false;
  const char *npath = tcqdbpath(qdb);
  if(!npath) npath = "(unknown)";
  printf("path: %s\n", npath);
  printf("database type: qgram\n");
  printf("inode number: %lld\n", (long long)tcqdbinode(qdb));
  char date[48];
  tcdatestrwww(tcqdbmtime(qdb), INT_MAX, date);
  printf("modified time: %s\n", date);
  uint8_t opts = tcqdbopts(qdb);
  printf("options:");
  if(opts & QDBTLARGE) printf(" large");
  if(opts & QDBTDEFLATE) printf(" deflate");
  if(opts & QDBTTCBS) printf(" tcbs");
  printf("\n");
  printf("token number: %llu\n", (unsigned long long)tcqdbtnum(qdb));
  printf("file size: %llu\n", (unsigned long long)tcqdbfsiz(qdb));
  if(!tcqdbclose(qdb)){
    if(!err) printerr(qdb);
    err = true;
  }
  tcqdbdel(qdb);
  return err ? 1 : 0;
}


/* perform put command
   Opens `path' as a writer, normalizes a copy of `text' with `topts' and registers it under
   `id'.  Sync progress is reported through `mysynccb'.
   The return value is 0 on success or 1 if opening, storing or closing failed. */
static int procput(const char *path, int64_t id, const char *text, int omode, int topts){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  tcqdbsetsynccb(qdb, mysynccb, NULL);
  int rv = 0;
  if(tcqdbopen(qdb, path, QDBOWRITER | omode)){
    char *ntext = tcstrdup(text);        // normalization works in place, so use a private copy
    tctextnormalize(ntext, topts);
    if(!tcqdbput(qdb, id, ntext)){
      printerr(qdb);
      rv = 1;
    }
    tcfree(ntext);
    if(!tcqdbclose(qdb)){
      // report a close failure only if nothing has been reported yet
      if(rv == 0) printerr(qdb);
      rv = 1;
    }
  } else {
    printerr(qdb);
    rv = 1;
  }
  tcqdbdel(qdb);
  return rv;
}


/* perform out command
   Opens `path' as a writer, normalizes a copy of `text' with `topts' and removes it from the
   record `id'.  Sync progress is reported through `mysynccb'.
   The return value is 0 on success or 1 if opening, removing or closing failed. */
static int procout(const char *path, int64_t id, const char *text, int omode, int topts){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  tcqdbsetsynccb(qdb, mysynccb, NULL);
  int rv = 0;
  if(tcqdbopen(qdb, path, QDBOWRITER | omode)){
    char *ntext = tcstrdup(text);        // normalization works in place, so use a private copy
    tctextnormalize(ntext, topts);
    if(!tcqdbout(qdb, id, ntext)){
      printerr(qdb);
      rv = 1;
    }
    tcfree(ntext);
    if(!tcqdbclose(qdb)){
      // report a close failure only if nothing has been reported yet
      if(rv == 0) printerr(qdb);
      rv = 1;
    }
  } else {
    printerr(qdb);
    rv = 1;
  }
  tcqdbdel(qdb);
  return rv;
}


/* perform search command
   `path' specifies the path of the database, opened as a reader with the extra mode `omode'.
   `words' and `wnum' specify the search words; the caller guarantees `wnum' is at least 1.
   `topts' specifies the options of text normalization applied to a copy of each word.
   `emode' specifies how the results of several words are combined: EMUNION, EMDIFF, or
   intersection for any other value.  It is not used when there is a single word.
   `smode' specifies the search mode passed to `tcqdbsearch'.
   `max' specifies the maximum number of IDs to print.
   `ph' specifies whether to print the hit count and the elapsed time before the IDs.
   The return value is 0 on success or 1 on failure.  If the search of any word fails, the error
   is reported and no result is printed, with one word as well as with several. */
static int procsearch(const char *path, const char **words, int wnum, int omode,
                      int topts, int emode, int smode, int max, bool ph){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  if(!tcqdbopen(qdb, path, QDBOREADER | omode)){
    printerr(qdb);
    tcqdbdel(qdb);
    return 1;
  }
  bool err = false;
  if(wnum == 1){
    char *ntext = tcstrdup(words[0]);
    tctextnormalize(ntext, topts);
    double stime = tctime();
    int rnum;
    uint64_t *res = tcqdbsearch(qdb, ntext, smode, &rnum);
    if(res){
      if(ph){
        printf("hits: %d\n", rnum);
        printf("time: %.6f\n", tctime() - stime);
        printf("\n");
      }
      max = tclmin(max, rnum);
      for(int i = 0; i < max; i++){
        printf("%llu\n", (unsigned long long)res[i]);
      }
      tcfree(res);
    } else {
      printerr(qdb);
      err = true;
    }
    tcfree(ntext);
  } else {
    double stime = tctime();
    QDBRSET rsets[wnum];
    for(int i = 0; i < wnum; i++){
      char *ntext = tcstrdup(words[i]);
      tctextnormalize(ntext, topts);
      rsets[i].ids = tcqdbsearch(qdb, ntext, smode, &rsets[i].num);
      if(!rsets[i].ids){
        // a failed search must not pass silently, nor leave the count unset
        printerr(qdb);
        err = true;
        rsets[i].num = 0;
      }
      tcfree(ntext);
    }
    // combine and print only when every word was searched successfully
    if(!err){
      int rnum;
      uint64_t *res;
      switch(emode){
        case EMUNION:
          res = tcqdbresunion(rsets, wnum, &rnum);
          break;
        case EMDIFF:
          res = tcqdbresdiff(rsets, wnum, &rnum);
          break;
        default:
          res = tcqdbresisect(rsets, wnum, &rnum);
          break;
      }
      if(ph){
        printf("hits: %d\n", rnum);
        printf("time: %.6f\n", tctime() - stime);
        printf("\n");
      }
      max = tclmin(max, rnum);
      for(int i = 0; i < max; i++){
        printf("%llu\n", (unsigned long long)res[i]);
      }
      tcfree(res);
    }
    for(int i = 0; i < wnum; i++){
      tcfree(rsets[i].ids);
    }
  }
  if(!tcqdbclose(qdb)){
    if(!err) printerr(qdb);
    err = true;
  }
  tcqdbdel(qdb);
  return err ? 1 : 0;
}


/* perform optimize command
   Opens `path' as a writer with the extra open mode `omode' and optimizes the database.
   The return value is 0 on success or 1 if opening, optimizing or closing failed. */
static int procoptimize(const char *path, int omode){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  int rv = 0;
  if(tcqdbopen(qdb, path, QDBOWRITER | omode)){
    if(!tcqdboptimize(qdb)){
      printerr(qdb);
      rv = 1;
    }
    if(!tcqdbclose(qdb)){
      // report a close failure only if nothing has been reported yet
      if(rv == 0) printerr(qdb);
      rv = 1;
    }
  } else {
    printerr(qdb);
    rv = 1;
  }
  tcqdbdel(qdb);
  return rv;
}


/* perform importtsv command */
static int procimporttsv(const char *path, const char *file,
                         int64_t icsiz, int omode, int topts){
  TCQDB *qdb = tcqdbnew();
  if(g_dbgfd >= 0) tcqdbsetdbgfd(qdb, g_dbgfd);
  FILE *ifp = file ? fopen(file, "rb") : stdin;
  if(!ifp){
    fprintf(stderr, "%s: could not open\n", file ? file : "(stdin)");
    tcqdbdel(qdb);
    return 1;
  }
  tcqdbsetsynccb(qdb, mysynccb, NULL);
  if(!tcqdbsetcache(qdb, icsiz, -1)) printerr(qdb);
  if(!tcqdbopen(qdb, path, QDBOWRITER | QDBOCREAT | omode)){
    printerr(qdb);
    tcqdbdel(qdb);
    return 1;
  }
  bool err = false;
  char *line;
  int cnt = 0;
  while(!err && (line = mygetline(ifp)) != NULL){
    int64_t id = strtoll(line, NULL, 10);
    char *pv = strchr(line, '\t');
    if(id == 0 || !pv){
      tcfree(line);
      continue;
    }
    *(pv++) = '\0';
    char *ntext = tcstrdup(pv);
    tctextnormalize(ntext, topts);
    if(id > 0){
      if(!tcqdbput(qdb, id, ntext)){
        printerr(qdb);
        err = true;
      }
    } else {
      if(!tcqdbout(qdb, -id, ntext)){
        printerr(qdb);
        err = true;
      }
    }
    tcfree(ntext);
    tcfree(line);
    if(cnt > 0 && cnt % 100 == 0){
      putchar('.');
      fflush(stdout);
      if(cnt % 5000 == 0) printf(" (%08d)\n", cnt);
    }
    cnt++;
  }
  printf(" (%08d)\n", cnt);
  if(!tcqdbclose(qdb)){
    if(!err) printerr(qdb);
    err = true;
  }
  tcqdbdel(qdb);
  if(ifp != stdin) fclose(ifp);
  return err ? 1 : 0;
}


/* perform normalize command
   Prints a copy of `text' normalized with `topts', followed by a newline, to the standard
   output.  The return value is always 0. */
static int procnormalize(const char *text, int topts){
  char *copy = tcstrdup(text);           // normalization works in place, so use a private copy
  tctextnormalize(copy, topts);
  fputs(copy, stdout);
  putchar('\n');
  tcfree(copy);
  return 0;
}


/* perform version command
   Prints the version line (library version, library and format revisions, system name) and
   the copyright line to the standard output.  The return value is always 0. */
static int procversion(void){
  const char *fmt = "Tokyo Dystopia version %s (%d:%s) for %s\n";
  printf(fmt, tdversion, _TD_LIBVER, _TD_FORMATVER, TDSYSNAME);
  fputs("Copyright (C) 2007-2010 FAL Labs\n", stdout);
  return 0;
}



// END OF FILE
