#!/bin/bash

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# wall-clock start, used for the TOT line of the final report
total_start=$(date "+%s")

# stage selectors, all off until a command word turns them on
datafiles=false
populate=false
e2index=false
e2invert=false
fullIndex=false
internal=false
clean=false
scrub=false

# transport selection:
# revert to ftp (Aspera Connect) for now since https is currently much slower
useFtp=true
useHttps=false

# Consume leading command words and transport flags. Every recognized word
# is dropped by the single shift after the case statement; the first
# unrecognized word ends the loop and is left in "$@".
while [ $# -gt 0 ]; do
  case "$1" in
    setup | -setup )
      datafiles=true
      ;;
    daily | -daily )
      datafiles=true
      populate=true
      e2index=true
      e2invert=true
      ;;
    index | -index | reindex | -reindex )
      datafiles=true
      populate=true
      e2index=true
      e2invert=true
      fullIndex=true
      ;;
    clean | -clean | clear | -clear )
      # delete Indices contents and Increment files
      clean=true
      ;;
    scrub | -scrub )
      # same as clean, and delete Postings directories
      clean=true
      scrub=true
      ;;
    -internal | -int )
      # populate from files on internal network
      internal=true
      ;;
    -ftp )
      useFtp=true
      useHttps=false
      ;;
    -http | -https )
      useFtp=false
      useHttps=true
      ;;
    * )
      break
      ;;
  esac
  shift
done

# Second pass over whatever the first option loop left behind.
#   -path   the flag itself is discarded (only one shift, so a following
#           value is not consumed here and ends the loop via the * arm)
#   -*      any other dash option is reported on stderr and aborts
#   other   first plain word stops option processing
while [ $# -gt 0 ]; do
  case "$1" in
    -path )
      shift
      ;;
    -* )
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

# Locations of the local taxonomy archive; both must end up non-empty.
MASTER=""
WORKING=""

# get local master and working volumes from database:
# the reply of "rchive -local taxonomy" is split at a colon, with the text
# before the last colon becoming MASTER and the text after the first colon
# becoming WORKING
ev=$( rchive -local taxonomy )
if [ -n "$ev" ]; then
  MASTER="${ev%:*}"
  WORKING="${ev#*:}"
fi

# refuse to continue without a master path
[ -n "$MASTER" ] || {
  echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_MASTER environment variable" >&2
  exit 1
}

# refuse to continue without a working path
[ -n "$WORKING" ] || {
  echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_WORKING environment variable" >&2
  exit 1
}

# Normalize the MASTER and WORKING paths, make sure their parent volumes are
# mounted, and create the taxonomy directories on them when missing.
# Fatal diagnostics are written to stderr, matching the other error exits
# earlier in this script.

# remove trailing slash
MASTER=${MASTER%/}
WORKING=${WORKING%/}

# remove trailing taxonomy folder to get volume names
MVOLUME=${MASTER%/taxonomy}
WVOLUME=${WORKING%/taxonomy}

# check for existence of parent volumes
if [ ! -d "$MVOLUME" ]
then
  echo "ERROR: Master volume $MVOLUME is absent" >&2
  exit 1
fi

if [ ! -d "$WVOLUME" ]
then
  echo "ERROR: Working volume $WVOLUME is absent" >&2
  exit 1
fi

# create taxonomy directories on volumes, if necessary
if [ ! -d "$MASTER" ]
then
  echo "Creating taxonomy directory on master volume $MVOLUME"
  mkdir -p "$MASTER"
fi

if [ ! -d "$WORKING" ]
then
  echo "Creating taxonomy directory on working volume $WVOLUME"
  mkdir -p "$WORKING"
fi

# check for presence of taxonomy directories on volumes
# (this also catches a mkdir above that did not succeed)
if [ ! -d "$MASTER" ]
then
  echo "ERROR: Unable to find master directory $MASTER" >&2
  exit 1
fi

if [ ! -d "$WORKING" ]
then
  echo "ERROR: Unable to find working directory $WORKING" >&2
  exit 1
fi

# report data locations
echo "MASTER $MASTER"
echo "WORKING $WORKING"

# permanent folders on the master volume
mkdir -p "$MASTER/Archive" "$MASTER/Data" "$MASTER/Postings"

# a Sentinels folder found directly under MASTER is moved into Archive,
# otherwise Archive/Sentinels is created
if [ -d "$MASTER/Sentinels" ]; then
  mv "$MASTER/Sentinels" "$MASTER/Archive"
else
  mkdir -p "$MASTER/Archive/Sentinels"
fi

# scratch folders on the working volume
# no Source directory, raw data in Extras
mkdir -p "$WORKING/Extras" "$WORKING/Index" "$WORKING/Invert" "$WORKING/Merged"

# external helper, given both locations (its effect is not visible here)
pm-prepare "$MASTER" "$WORKING"

date

# elapsed seconds per stage, reported at the end of the run
DWN=0 DEL=0 SCB=0 POP=0
IDX=0 INV=0 MRG=0 PST=0

# Clean stage: remove downloaded data files and all incremental index
# products, recording the elapsed time in DEL.
if [ "$clean" = true ]; then
  seconds_start=$(date "+%s")

  echo "Deleting Data Files"
  cd "$MASTER/Data"
  target="$MASTER/Data"
  find "$target" -name "*.xml" -delete

  echo "Deleting Extra Files"
  cd "$WORKING/Extras"
  target="$WORKING/Extras"
  for pattern in "*.xml" "*.dmp" "*.tar.gz"; do
    find "$target" -name "$pattern" -delete
  done

  echo "Deleting Incremental Indices"
  # folder:suffix pairs, each cleared of plain and gzipped files
  for pair in Index:e2x Invert:inv Merged:mrg; do
    target="$WORKING/${pair%%:*}"
    suffix="${pair#*:}"
    cd "$target"
    find "$target" -name "*.$suffix" -delete
    find "$target" -name "*.$suffix.gz" -delete
  done

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  DEL=$seconds
  echo "$DEL seconds"
  echo ""
  sleep 1
fi

# RemovePosts deletes the per-field postings directories (SCIN, RANK, ...,
# UID) found in the CURRENT working directory.
#
# Arguments: none
# Requires:  the caller must already have changed into the Postings folder,
#            because every name is removed relative to the current directory.
# Effects:   each listed entry is removed recursively; the deletions run as
#            background jobs and the function returns once all have finished.
#            Entries with other names are left untouched.
RemovePosts() {

  # keep the loop variable out of the caller's scope
  local dir

  # A single pass is sufficient: removing a field directory recursively
  # also removes anything nested inside it.
  for dir in SCIN RANK GNSP COMN TXDV TXSY \
             TAXA LNGE TREE MODS PROP VCHR \
             GC HGC MGC PGC UID
  do
    rm -rf -- "$dir" &
  done

  # barrier for all background deletions
  wait
}

# Scrub stage: clear the postings folders under $MASTER/Postings and record
# the elapsed time in SCB.
if [ "$scrub" = true ]
then
  seconds_start=$(date "+%s")
  echo "Clearing Postings Folders"
  # RemovePosts deletes entries by relative name, so it must only run after
  # the directory change has succeeded; otherwise same-named entries in an
  # unrelated current directory would be removed.
  if cd "$MASTER/Postings"
  then
    RemovePosts
  else
    echo "ERROR: Unable to enter $MASTER/Postings, postings not cleared" >&2
  fi
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  SCB=$seconds
  echo "$SCB seconds"
  echo ""
  sleep 1
fi

# Data files stage: obtain the taxonomy dump, expand it, and build
# taxoninfo.xml in the Extras folder. Elapsed time is recorded in DWN.
if [ "$datafiles" = true ]; then
  seconds_start=$(date "+%s")
  cd "$WORKING/Extras"

  # decide the transfer protocol once; ftp wins when both flags are set
  transport=""
  if [ "$useFtp" = true ]; then
    transport=ftp
  elif [ "$useHttps" = true ]; then
    transport=https
  fi

  if [ "$internal" = true ]; then
    # internal mode skips the archive download
    echo "Will Use Direct Access To Taxonomy Files On FTP Site"
  else
    echo "Downloading Taxonomy Files"
    # an archive already present in Extras is not fetched again
    if [ ! -f "new_taxdump.tar.gz" ]; then
      case "$transport" in
        ftp )
          nquire -asp ftp.ncbi.nlm.nih.gov "pub/taxonomy/new_taxdump" "new_taxdump.tar.gz"
          ;;
        https )
          nquire -bulk -get https://ftp.ncbi.nlm.nih.gov pub/taxonomy/new_taxdump new_taxdump.tar.gz > new_taxdump.tar.gz
          ;;
      esac
    fi
  fi

  # expand components
  case "$transport" in
    ftp | https )
      download-ncbi-data "-$transport" taxoninfo
      ;;
  esac

  # integrate into taxoninfo source file
  rm -f taxoninfo.xml
  rchive -taxon "$WORKING/Extras" AC > taxoninfo.xml

  # seed the master Data folder only when it has no copy yet
  if [ -f "$WORKING/Extras/taxoninfo.xml" ] && [ ! -f "$MASTER/Data/taxoninfo.xml" ]; then
    cp "$WORKING/Extras/taxoninfo.xml" "$MASTER/Data/taxoninfo.xml"
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  DWN=$seconds
  echo "$DWN seconds"
  sleep 1
fi

# Populate stage: stream taxoninfo.xml from Extras into the archive, with
# TaxonInfo as the record pattern and TaxID as the index element.
# Elapsed time is recorded in POP.
if [ "$populate" = true ]; then
  seconds_start=$(date "+%s")
  echo "Populating Taxonomy Archive"
  cd "$WORKING/Extras"

  cat taxoninfo.xml |
    rchive -gzip -db taxonomy \
      -archive "$MASTER/Archive" "$WORKING/Index" "$WORKING/Invert" \
      -index TaxID \
      -pattern TaxonInfo

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  POP=$seconds
  echo "$POP seconds"
  echo ""
  sleep 1
fi

# Archive probe, run on every invocation: fetch record 1949059 and print
# "Archive is " followed by the upper-cased first letters of its Genus and
# Species (presumably "OK" for this record - confirm against the data).
echo 1949059 |
  fetch-taxonomy -path "$MASTER/Archive" |
  xtract -pattern TaxonInfo -sep "" -pfx "Archive is " \
    -upper "Genus[1:1],Species[1:1]"

echo ""

# Incremental indexing: rchive works from the archive into the working
# Index folder. Elapsed time is recorded in IDX.
if [ "$e2index" = true ]; then
  seconds_start=$(date "+%s")
  echo "Incremental Indexing"

  rchive -db taxonomy \
    -e2incIndex "$MASTER/Archive" "$WORKING/Index" \
    -e2index

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  IDX=$seconds
  echo "IDX $IDX seconds"
  echo ""
  sleep 1
fi

# Incremental inversion: rchive works from the Index folder into the Invert
# folder. Elapsed time is recorded in INV.
if [ "$e2invert" = true ]; then
  seconds_start=$(date "+%s")
  echo "Incremental Inversion"

  rchive -db taxonomy \
    -e2incInvert "$WORKING/Index" "$WORKING/Invert"

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  INV=$seconds
  echo "INV $INV seconds"
  echo ""
  sleep 1
fi

# Merge stage (full index only): combine every *.inv.gz file in the Invert
# folder into the Merged folder. Elapsed time is recorded in MRG.
if [ "$fullIndex" = true ]; then
  seconds_start=$(date "+%s")
  echo "Merging Inverted Indices"
  cd "$WORKING/Invert"

  # the glob is expanded by the shell, relative to the Invert folder
  rchive -gzip -db taxonomy -merge "$WORKING/Merged" *.inv.gz

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  MRG=$seconds
  echo "MRG $MRG seconds"
  echo ""
  sleep 1

  # taxonomy only goes to zyzzyzus warreni, so zy.mrg.gz is the file checked
  # to decide whether the merge completed
  if [ ! -f "$WORKING/Merged/zy.mrg.gz" ]; then
    echo "ERROR: Merge failed to complete - missing zy.mrg.gz file"
    echo ""
    echo "EXITING DUE TO BUILD FAILURE"
    echo ""
    # the script does not exit here; clearing the flag skips the later
    # full-index stages and the run continues to the final report
    fullIndex=false
  fi
fi

# Postings stage (full index only): promote the merged index files into the
# master Postings folder. Elapsed time is recorded in PST.
if [ "$fullIndex" = true ]; then
  seconds_start=$(date "+%s")
  echo "Producing Postings Files"
  cd "$WORKING/Merged"

  # field list handed to rchive as one argument (leading space included)
  term=" SCIN RANK GNSP COMN TXDV TXSY TAXA LNGE TREE MODS PROP VCHR GC HGC MGC PGC UID"

  # sorted file names, processed in batches of up to 100 per rchive call
  printf '%s\n' *.mrg.gz |
    sort |
    xargs -n 100 echo |
    while read batch; do
      # $batch is deliberately unquoted so each file name is its own argument
      rchive -db taxonomy -promote "$MASTER/Postings" "$term" $batch
    done

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  PST=$seconds
  echo "PST $PST seconds"
  echo ""
  sleep 1
fi

# Verification (full index only): search the GNSP field for a known
# organism, fetch the matching record from the archive, and reduce it to the
# upper-cased first letters of Genus and Species. The merged files are
# deleted only when that result is exactly "OK".
okay=""
if [ "$fullIndex" = true ]; then
  okay=$(
    phrase-search -db taxonomy -query "ochlandra keralensis [GNSP]" |
      fetch-taxonomy -path "$MASTER/Archive" |
      xtract -pattern TaxonInfo -sep "" -upper "Genus[1:1],Species[1:1]"
  )

  if [ "$okay" = "OK" ]; then
    echo "Archive and Index are $okay"
    echo ""

    # the merged intermediates are no longer needed
    cd "$WORKING/Merged"
    target="$WORKING/Merged"
    for pattern in "*.mrg" "*.mrg.gz"; do
      find "$target" -name "$pattern" -delete
    done
  fi

  # return to the home directory
  cd
fi

# Final report: one line per stage with its elapsed seconds. DWN and POP are
# always printed; the other lines appear only when their stage flag is still
# true at this point.
echo ""
echo "ARCHIVE-TAXONOMY"
echo ""

echo "DWN $DWN seconds"
if [ "$clean" = true ]; then
  echo "DEL $DEL seconds"
fi
if [ "$scrub" = true ]; then
  echo "SCB $SCB seconds"
fi
echo "POP $POP seconds"

if [ "$e2index" = true ]; then
  echo "IDX $IDX seconds"
fi
if [ "$e2invert" = true ]; then
  echo "INV $INV seconds"
fi
if [ "$fullIndex" = true ]; then
  echo "MRG $MRG seconds"
  echo "PST $PST seconds"
fi

echo ""

# total wall-clock time since the script started
total_end=$(date "+%s")
total=$((total_end - total_start))
TOT=$total
echo "TOT $TOT seconds"
echo ""

date

# When CONFIG is non-empty, print a command the user can run to persist the
# archive path in a shell startup file. CONFIG is not assigned anywhere in
# the visible part of this script - presumably it comes from the
# environment; confirm.
#
# Startup file choice:
#   .bash_profile by default;
#   .bashrc when .bashrc does not mention bash_profile and either
#   .bash_profile does not exist or .bash_profile itself mentions bashrc.
if [ -n "$CONFIG" ]
then
  target=bash_profile
  if ! grep "$target" "$HOME/.bashrc" >/dev/null 2>&1
  then
    # the path is quoted so that a HOME containing whitespace is tested as a
    # single file name instead of breaking the test expression
    if [ ! -f "$HOME/.$target" ] || grep 'bashrc' "$HOME/.$target" >/dev/null 2>&1
    then
      target=bashrc
    fi
  fi
  echo ""
  echo "For convenience, please execute the following to save the archive path to a variable:"
  echo ""
  echo "  echo \"export EDIRECT_TAXONOMY_MASTER='${CONFIG}'\" >>" "\$HOME/.$target"
  echo ""
fi
