{-|
Module      : Gargantext.Ngrams.Parser
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Here is a longer description of this module, containing some
commentary with @some markup@.
-}

{-# LANGUAGE NoImplicitPrelude   #-}
{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Gargantext.Ngrams.Parser where

import Gargantext.Prelude
import Gargantext.Ngrams.CoreNLP
import Data.Text hiding (map)

import Gargantext.Types.Main (Language(..))
import qualified Gargantext.Ngrams.Lang.En as En
import qualified Gargantext.Ngrams.Lang.Fr as Fr

type SNgrams       = (Text, Text, Text)

-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
-- A word is a form.

-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text


-- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subfoorms can be representend as Tree whose top is the minimal form
-- as a monogram whos occurrences are 

-- ps : Common words function in Haskell do not take sentence into account


-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s


extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' lang t =  map (map token2text)
                     <$> map _sentenceTokens
                     <$> _sentences
                     <$> corenlp lang t

-- | This function selects ngrams according to grammars specific
--   of each language.
--   In english, JJ is ADJectiv in french.
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams

-- | This function analyze and groups (or not) ngrams according to 
--   grammars specific of each language.
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams