1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE ScopedTypeVariables #-}
4 module Data.Gargantext.Ngrams.Parser where
6 import Data.Gargantext.Prelude
7 import Data.Gargantext.Ngrams.CoreNLP
10 import Data.Gargantext.Types.Main (Language(..), Ngrams)
11 import qualified Data.Gargantext.Ngrams.Lang.En as En
12 import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr
15 -- | Ngrams selection algorithms
-- A form is a list of characters separated by one or more spaces in a sentence.
20 -- For performance reasons, Type Text is used, then:
24 -- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subforms can be represented as a Tree whose top is the minimal form
-- as a monogram whose occurrences are
-- PS: the common `words` function in Haskell does not take sentences into account
-- TODO for scientific papers: add measures
32 -- TODO add the p score regex
-- | Run the extraction on a text, then regroup its result.
--
-- The text @s@ is handed to 'extractNgrams'' together with @lang@; once that
-- action has produced its nested list, 'groupNgrams' for the same language is
-- applied to it through 'pm' (presumably the project prelude's list map, with
-- one inner list per sentence -- TODO confirm).
extractNgrams :: Language -> String -> IO [[Ngrams]]
extractNgrams lang s = regroup <$> extractNgrams' lang s
  where
    -- Grouping rules of the requested language, pushed through 'pm'.
    regroup extracted = pm (groupNgrams lang) extracted
-- | Lower-level extraction step called by 'extractNgrams', before any grouping.
--
-- The part visible here applies 'token2text' through two nested 'pm' calls on
-- top of the result of @pm _sentenceTokens@.
-- NOTE(review): the argument @t@ is not referenced in the lines shown here, so
-- the pipeline presumably continues past this point (e.g. with the CoreNLP
-- call that consumes @lang@ and @t@) -- confirm against the complete file.
37 extractNgrams' :: Language -> String -> IO [[Ngrams]]
38 extractNgrams' lang t = pm (pm token2text)
39 <$> pm _sentenceTokens
-- | Select ngrams with the grammar rules specific to the given language.
--
-- This only dispatches on the 'Language': 'EN' is handled by
-- 'En.selectNgrams' and 'FR' by 'Fr.selectNgrams'. Tag names depend on the
-- language: the original note here says that JJ in English is ADJectiv in
-- French.
-- NOTE(review): no other 'Language' constructor is matched -- confirm that
-- 'EN' and 'FR' are the only ones, otherwise this function is partial.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
-- | Analyze and group (or not) ngrams with the grammar rules specific to the
-- given language.
--
-- This only dispatches on the 'Language': 'EN' is handled by
-- 'En.groupNgrams' and 'FR' by 'Fr.groupNgrams'.
-- NOTE(review): no other 'Language' constructor is matched -- confirm that
-- 'EN' and 'FR' are the only ones, otherwise this function is partial.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams