{-|
Module      : CleanCsvCorpus.hs
Description : Gargantext starter
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Given a Gargantext CSV file and its query, this script cleans and
compresses the contexts around the main terms of the query.
-}

{-# LANGUAGE NoImplicitPrelude #-}

module CleanCsvCorpus  where

--import GHC.IO (FilePath)
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Text (pack)
import Data.Vector (Vector)
import qualified Data.Vector as V

import Gargantext.Prelude
import Gargantext.Text.Search
import qualified Gargantext.Text.Parsers.CSV as CSV
------------------------------------------------------------------------

-- | A search query, represented as a list of 'S.Term' values.
-- NOTE(review): this alias is not referenced in the visible part of this
-- file — confirm whether it is still needed.
type Query = [S.Term]

-- | Keep only the documents whose 'CSV.d_docId' appears in the given list
-- of document ids.
--
-- The id list is converted to a set once per call (in the @where@ clause)
-- rather than inside the predicate, so the set is not rebuilt for every
-- document when the compiler does not float the expression out by itself.
filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter isWanted
  where
    -- Set of ids to keep, shared by every predicate evaluation.
    wantedIds = S.fromList docIds

    -- True when the document's id is one of the requested ids.
    isWanted doc = S.member (CSV.d_docId doc) wantedIds


-- | Filter the corpus stored in @/tmp/Gargantext_Corpus.csv@ with a
-- hard-coded query and write the matching documents to
-- @/tmp/Gargantext_Corpus_bis.csv@.
--
-- The number of documents and the value of 'CSV.docsSize' are printed both
-- before and after filtering.
main :: IO ()
main = do
  let inputPath  = "/tmp/Gargantext_Corpus.csv"
      outputPath = "/tmp/Gargantext_Corpus_bis.csv"
      -- Alternative query kept for reference:
      -- ["water", "scarcity", "morocco", "shortage","flood"]
      queryTerms = ["gratuit", "gratuité", "culture", "culturel"]

  -- The first component is passed through unchanged to 'CSV.writeFile'
  -- (presumably the CSV header — confirm against 'CSV.readFile').
  (hdr, rawDocs) <- CSV.readFile inputPath

  putStrLn $ "Number of documents before:" <> show (V.length rawDocs)
  putStrLn $ "Mean size of docs:" <> show (CSV.docsSize rawDocs)

  -- Index every document in a fresh search engine, run the query, and keep
  -- only the documents whose id was returned by the engine.
  let indexable = CSV.toDocs rawDocs
      hits      = S.query (insertDocs indexable initialDocSearchEngine) (map pack queryTerms)
      kept      = CSV.fromDocs (filterDocs hits (V.fromList indexable))

  putStrLn $ "Number of documents after:" <> show (V.length kept)
  putStrLn $ "Mean size of docs:" <> show (CSV.docsSize kept)

  CSV.writeFile outputPath (hdr, kept)