#!/bin/bash

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# Epoch-seconds timestamp taken at startup; the TOT figure printed at the
# end of the run is computed against it.
total_start=$(date "+%s")

# Indexing stages, all off unless a mode keyword turns them on.
e2index=false e2invert=false fullIndex=false

# Populate from files on the internal network instead of downloading.
internal=false

# Cleanup levels; each later level also switches on the earlier ones.
clean=false scrub=false scour=false

# revert to ftp (Aspera Connect) for now since https is currently much slower
useFtp=true useHttps=false

# Leading keywords and switches select what this run does.  Every
# recognized word is consumed by the single shift below the case; the
# first unrecognized word ends the loop and stays in "$@".
while [ "$#" -gt 0 ]
do
  case "$1" in
    daily | -daily )
      # incremental indexing and inversion
      e2index=true
      e2invert=true
      ;;
    index | -index | reindex | -reindex )
      # as daily, plus the full-index stages
      e2index=true
      e2invert=true
      fullIndex=true
      ;;
    clean | -clean | clear | -clear )
      # delete Indices contents and Increment files
      clean=true
      ;;
    scrub | -scrub )
      # clean, and delete Postings directories
      clean=true
      scrub=true
      ;;
    scour | -scour )
      # scrub, and delete Archive and Sentinels directories
      clean=true
      scrub=true
      scour=true
      ;;
    -internal | -int )
      # populate from files on internal network
      internal=true
      ;;
    -ftp )
      useFtp=true
      useHttps=false
      ;;
    -http | -https )
      useFtp=false
      useHttps=true
      ;;
    * )
      break
      ;;
  esac
  shift
done

# Second pass over whatever the mode loop left behind.  A -path flag is
# accepted and dropped (only the flag itself is shifted), any other
# dash-prefixed word is fatal, and the first plain word ends the scan.
while [ "$#" -gt 0 ]
do
  if [ "$1" = "-path" ]
  then
    shift
    continue
  fi
  case "$1" in
    -* )
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
  esac
  break
done

MASTER=""
WORKING=""

# get local master and working volumes from database
# (rchive prints them as one colon-separated line)
ev=$( rchive -local pmc )
if [ -n "$ev" ]
then
  # text before the last colon / text after the first colon
  MASTER="${ev%:*}"
  WORKING="${ev#*:}"
fi

# both locations are mandatory; name the matching variable in the message
for slot in MASTER WORKING
do
  if [ -z "${!slot}" ]
  then
    echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_${slot} environment variable" >&2
    exit 1
  fi
done

# Normalize the master/working paths and make sure both pmc directories
# exist.  Every fatal message goes to stderr, like the earlier
# missing-path errors, so it is not mixed into captured stdout.

# remove trailing slash
MASTER=${MASTER%/}
WORKING=${WORKING%/}

# remove trailing pmc folder to get volume names
MVOLUME=${MASTER%/pmc}
WVOLUME=${WORKING%/pmc}

# check for existence of parent volumes
if [ ! -d "$MVOLUME" ]
then
  echo "ERROR: Master volume $MVOLUME is absent" >&2
  exit 1
fi

if [ ! -d "$WVOLUME" ]
then
  echo "ERROR: Working volume $WVOLUME is absent" >&2
  exit 1
fi

# create pmc directories on volumes, if necessary
if [ ! -d "$MASTER" ]
then
  echo "Creating pmc directory on master volume $MVOLUME"
  mkdir -p "$MASTER"
fi

if [ ! -d "$WORKING" ]
then
  echo "Creating pmc directory on working volume $WVOLUME"
  mkdir -p "$WORKING"
fi

# check for presence of pmc directories on volumes
# (this also catches a failed mkdir above)
if [ ! -d "$MASTER" ]
then
  echo "ERROR: Unable to find master directory $MASTER" >&2
  exit 1
fi

if [ ! -d "$WORKING" ]
then
  echo "ERROR: Unable to find working directory $WORKING" >&2
  exit 1
fi

# report data locations
printf 'MASTER %s\nWORKING %s\n' "$MASTER" "$WORKING"

# top-level folders on the master volume
mkdir -p "$MASTER/Archive" "$MASTER/Data" "$MASTER/Postings"

# Sentinels belongs under Archive: relocate a top-level Sentinels
# folder if one is present, otherwise create it in place
if [ ! -d "$MASTER/Sentinels" ]
then
  mkdir -p "$MASTER/Archive/Sentinels"
else
  mv "$MASTER/Sentinels" "$MASTER/Archive"
fi

# the download folder is named Source: rename a PMC folder if one is
# present, otherwise create Source
if [ ! -d "$WORKING/PMC" ]
then
  mkdir -p "$WORKING/Source"
else
  mv "$WORKING/PMC" "$WORKING/Source"
fi

# remaining folders on the working volume
mkdir -p "$WORKING/Extras" "$WORKING/Index" "$WORKING/Invert" "$WORKING/Merged"

pm-prepare "$MASTER" "$WORKING"

date

# Elapsed seconds for each stage; stages that do not run keep 0.
DWN=0
DEL=0
SCB=0
SCR=0
POP=0
IDX=0
INV=0
MRG=0
PST=0

seconds_start=$(date "+%s")
# download-pmc is given no destination argument, so it presumably works
# in the current directory; refuse to continue from the wrong place
if ! cd "$WORKING/Source"
then
  echo "ERROR: Unable to enter source directory $WORKING/Source" >&2
  exit 1
fi
if [ "$internal" = true ]
then
  # -internal skips the download step entirely
  echo "Will Use Direct Access To PMC Files On FTP Site"
else
  echo "Downloading New PMC Files"
  if [ "$useHttps" = true ]
  then
    download-pmc -https
  else
    download-pmc -ftp
  fi
fi
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
DWN=$seconds
echo "$DWN seconds"
sleep 1

# clean: remove incremental index (.e2x) and inversion (.inv) files,
# compressed or not, from anywhere below the working Index and Invert
# folders
if [ "$clean" = true ]
then
  seconds_start=$(date "+%s")
  echo "Deleting Incremental Indices"
  cd "$WORKING/Index"
  target="$WORKING/Index"
  for pat in "*.e2x" "*.e2x.gz"
  do
    find "$target" -name "$pat" -delete
  done
  cd "$WORKING/Invert"
  target="$WORKING/Invert"
  for pat in "*.inv" "*.inv.gz"
  do
    find "$target" -name "$pat" -delete
  done
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  DEL=$seconds
  printf '%s seconds\n\n' "$DEL"
  sleep 1
fi

# RemovePosts deletes the term-field postings folders (ABST AUTH JOUR
# PAIR TEXT TITL YEAR UID) from the CURRENT directory, one background
# rm per folder, and returns once all of them have finished.
# Arguments: none.  The caller must already have changed into the
#            directory that holds the folders.
# Note: the former first pass looped over the one-word list "$dir" and
# so only ever targeted DIR/DIR, which the recursive removal of DIR
# covers anyway; it has been dropped.  The loop variable is now local
# so callers' own "dir" is left untouched.
RemovePosts() {

  local dir

  for dir in ABST AUTH JOUR PAIR TEXT TITL YEAR UID
  do
    rm -rf -- "$dir" &
  done

  # barrier: do not return while any deletion is still running
  wait
}

# scrub: clear the postings folders on the master volume
if [ "$scrub" = true ]
then
  seconds_start=$(date "+%s")
  echo "Clearing Postings Folders"
  # RemovePosts deletes by relative path, so it must never run unless
  # the change into the Postings folder actually succeeded
  if ! cd "$MASTER/Postings"
  then
    echo "ERROR: Unable to enter postings directory $MASTER/Postings" >&2
    exit 1
  fi
  RemovePosts
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  SCB=$seconds
  echo "$SCB seconds"
  echo ""
  sleep 1
fi

# RemoveArchives deletes every subdirectory of $MASTER/Archive, one
# background rm per subdirectory, and returns once all have finished.
# Plain files directly inside Archive are left alone.
# Globals:   MASTER (read) - must be non-empty; the shell aborts
#            otherwise, so an empty value can never turn the glob into
#            /Archive/*.
# Arguments: none.
RemoveArchives() {

  local root="${MASTER:?RemoveArchives requires MASTER to be set}/Archive"
  local dir

  for dir in "$root"/*
  do
    # an unmatched glob stays literal and fails this test
    if [ -d "$dir" ]
    then
      rm -rf -- "$dir" &
    fi
  done

  wait
}

# scour: empty the master Archive folder of its subdirectories, then put
# back an empty Sentinels folder
if [ "$scour" = true ]
then
  seconds_start=$(date "+%s")
  echo "Clearing Archive Folder"
  cd "$MASTER/Archive"
  RemoveArchives
  mkdir -p "$MASTER/Archive/Sentinels"
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  SCR=$seconds
  printf '%s seconds\n\n' "$SCR"
  sleep 1
fi

# PMCStash streams one downloaded .tar.gz through pmc2info, transmute
# and rchive, then records completion by touching a sentinel named
# after the archive.
# Globals:   MASTER, WORKING (read)
# Arguments: $1 - archive file name, relative to the current directory
# Outputs:   the archive name without .tar.gz on stdout; an error
#            message on stderr if extraction fails
# Returns:   non-zero when tar fails; in that case no sentinel is
#            written, so the archive is not marked as processed.
#            Only tar's status is checked - the exit conventions of the
#            downstream tools are not visible from here.
PMCStash() {

  local fl="$1"
  local base=${fl%.tar.gz}
  local -a stage

  echo "$base"

  tar -xOzf "$fl" --to-stdout |
  pmc2info |
  transmute -mixed -format |
  rchive -gzip -db pmc \
    -archive "$MASTER/Archive" "$WORKING/Index" "$WORKING/Invert" \
    -index UID -pattern PMCInfo
  # must be captured immediately, before any other command runs
  stage=( "${PIPESTATUS[@]}" )

  if [ "${stage[0]}" -ne 0 ]
  then
    echo "ERROR: Extraction of $fl failed - sentinel not written" >&2
    return 1
  fi

  touch "$MASTER/Archive/Sentinels/$base.snt"
}

# Feed every not-yet-processed archive in the Source folder to PMCStash.
# Baseline archives are handled before incremental ones, and within each
# the oa_comm, oa_noncomm and oa_other groups are taken in that order.
seconds_start=$(date "+%s")
echo "Populating PMC Archive"
# the *.tar.gz glob below is relative, so the directory change must hold
if ! cd "$WORKING/Source"
then
  echo "ERROR: Unable to enter source directory $WORKING/Source" >&2
  exit 1
fi
for flt in baseline incr
do
  for dir in oa_comm oa_noncomm oa_other
  do
    for fl in *.tar.gz
    do
      # keep only names containing both the release type and the group;
      # matching in the shell avoids two grep processes per file and the
      # backslash and whitespace mangling of a plain "read"
      case "$fl" in
        *"$flt"* ) ;;
        * ) continue ;;
      esac
      case "$fl" in
        *"$dir"* ) ;;
        * ) continue ;;
      esac
      base=${fl%.tar.gz}
      # a sentinel marks an archive that has already been stashed
      if [ ! -f "$MASTER/Archive/Sentinels/$base.snt" ]
      then
        PMCStash "$fl"
      fi
    done
  done
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
POP=$seconds
echo "$POP seconds"
sleep 1

echo ""

# Spot check: fetch one known record from the archive and print the
# initials of the author whose last name is Adeyemo after "Archive is ".
# NOTE(review): this uses -pattern Auth while the later postings check
# uses -pattern AUTH - confirm which element name the archived records
# actually carry.
printf '%s\n' 4948736 |
fetch-pmc -path "$MASTER/Archive" |
xtract -pattern Auth -if LastName -equals Adeyemo \
  -pfx "Archive is " -element Initials

echo ""

# Incremental indexing: archive -> working Index folder
if [ "$e2index" = true ]
then
  seconds_start=$(date "+%s")
  echo "Incremental Indexing"
  rchive -db pmc -e2incIndex "$MASTER/Archive" "$WORKING/Index" -e2index
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  IDX=$seconds
  printf 'IDX %s seconds\n\n' "$IDX"
  sleep 1
fi

# Incremental inversion: working Index folder -> working Invert folder
if [ "$e2invert" = true ]
then
  seconds_start=$(date "+%s")
  echo "Incremental Inversion"
  rchive -db pmc -e2incInvert "$WORKING/Index" "$WORKING/Invert"
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  INV=$seconds
  printf 'INV %s seconds\n\n' "$INV"
  sleep 1
fi

# Full index, step 1: merge the inverted index files into the Merged
# folder.  zz.mrg.gz is checked afterwards as the sign of a complete
# merge.  Despite the wording of the message the script does not exit
# here; it clears fullIndex so the remaining full-index steps are
# skipped.
if [ "$fullIndex" = true ]
then
  seconds_start=$(date "+%s")
  echo "Merging Inverted Indices"
  cd "$WORKING/Invert"
  rchive -gzip -db pmc -merge "$WORKING/Merged" *.inv.gz
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  MRG=$seconds
  printf 'MRG %s seconds\n\n' "$MRG"
  sleep 1
  [ -f "$WORKING/Merged/zz.mrg.gz" ] || {
    printf '%s\n\n' \
      "ERROR: Merge failed to complete - missing zz.mrg.gz file" \
      "EXITING DUE TO BUILD FAILURE"
    fullIndex=false
  }
fi

# Full index, step 2: promote the merged files to postings on the master
# volume.  File names are sorted and handed to rchive in batches of at
# most 100; the field list travels as a single argument.
if [ "$fullIndex" = true ]
then
  seconds_start=$(date "+%s")
  echo "Producing Postings Files"
  cd "$WORKING/Merged"
  term="UID YEAR JOUR AUTH TITL ABST TEXT PAIR"
  printf '%s\n' *.mrg.gz |
  sort |
  xargs -n 100 echo |
  while read files
  do
    # $files is left unquoted on purpose: one batch line -> many arguments
    rchive -db pmc -promote "$MASTER/Postings" "$term" $files
  done
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  PST=$seconds
  printf 'PST %s seconds\n\n' "$PST"
  sleep 1
fi

# Full index, step 3: end-to-end check.  Look one known title up in the
# new postings, fetch the matching record and extract the initials of
# the author named Adeyemo; the build counts as good only when that
# yields exactly "OK".
okay=""
if [ "$fullIndex" = true ]
then
  okay=$(
    phrase-search -db pmc -path "$MASTER/Postings" -query "occupational hazards among the abattoir workers [TITL] AND 2016 [YEAR]" |
    fetch-pmc -path "$MASTER/Archive" |
    xtract -pattern AUTH -if LastName -equals Adeyemo -element Initials
  )
  if [ "$okay" = "OK" ]
  then
    printf 'Archive and Index are %s\n\n' "$okay"
  fi
fi

# Full index, step 4: once the check has passed, the merged files are no
# longer needed and are removed.  Either way the script then leaves the
# working tree by changing to the home directory.
if [ "$fullIndex" = true ]
then
  [ "$okay" != "OK" ] || {
    cd "$WORKING/Merged"
    target="$WORKING/Merged"
    for pat in "*.mrg" "*.mrg.gz"
    do
      find "$target" -name "$pat" -delete
    done
  }
  cd
fi

# Closing report: one line per stage that was requested for this run,
# then the total elapsed time and the current date.
printf '\n%s\n\n' "ARCHIVE-PMC"

echo "DWN $DWN seconds"
if [ "$clean" = true ]; then echo "DEL $DEL seconds"; fi
if [ "$scrub" = true ]; then echo "SCB $SCB seconds"; fi
if [ "$scour" = true ]; then echo "SCR $SCR seconds"; fi
echo "POP $POP seconds"
if [ "$e2index" = true ]; then echo "IDX $IDX seconds"; fi
if [ "$e2invert" = true ]; then echo "INV $INV seconds"; fi
if [ "$fullIndex" = true ]
then
  # merge and postings times are reported together
  printf 'MRG %s seconds\nPST %s seconds\n' "$MRG" "$PST"
fi

echo ""

total_end=$(date "+%s")
total=$((total_end - total_start))
TOT=$total
printf 'TOT %s seconds\n\n' "$TOT"

date

# When CONFIG is set, suggest a command that appends an export of the
# archive path to a shell startup file.  .bash_profile is suggested
# unless .bashrc does not mention it AND .bash_profile is either missing
# or itself mentions bashrc, in which case .bashrc is suggested.
if [ -n "$CONFIG" ]
then
  target=bash_profile
  if ! grep "$target" "$HOME/.bashrc" >/dev/null 2>&1
  then
    # quoted so a home directory containing whitespace cannot break the
    # test and leave the wrong file selected
    if [ ! -f "$HOME/.$target" ] || grep 'bashrc' "$HOME/.$target" >/dev/null 2>&1
    then
      target=bashrc
    fi
  fi
  echo ""
  echo "For convenience, please execute the following to save the archive path to a variable:"
  echo ""
  echo "  echo \"export EDIRECT_PMC_MASTER='${CONFIG}'\" >>" "\$HOME/.$target"
  echo ""
fi
