{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude   #-}

module Gargantext.Ngrams.Parser where

import Gargantext.Prelude
import Gargantext.Ngrams.CoreNLP
import Data.Text hiding (map)

import Gargantext.Types.Main (Language(..), Ngrams)
import qualified Gargantext.Ngrams.Lang.En as En
import qualified Gargantext.Ngrams.Lang.Fr as Fr


-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
-- A word is a form.

-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text


-- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subfoorms can be representend as Tree whose top is the minimal form
-- as a monogram whos occurrences are 

-- ps : Common words function in Haskell do not take sentence into account


-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: Language -> Text -> IO [[Ngrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s


extractNgrams' :: Language -> Text -> IO [[Ngrams]]
extractNgrams' lang t =  map (map token2text)
                     <$> map _sentenceTokens
                     <$> sentences
                     <$> corenlp lang t

-- | This function selects ngrams according to grammars specific
--   of each language.
--   In english, JJ is ADJectiv in french.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams

-- | This function analyze and groups (or not) ngrams according to 
--   grammars specific of each language.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams