1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE ScopedTypeVariables #-}
3 {-# LANGUAGE NoImplicitPrelude #-}
5 module Gargantext.Ngrams.Parser where
7 import Gargantext.Prelude
8 import Gargantext.Ngrams.CoreNLP
9 import Data.Text hiding (map)
11 import Gargantext.Types.Main (Language(..), Ngrams)
12 import qualified Gargantext.Ngrams.Lang.En as En
13 import qualified Gargantext.Ngrams.Lang.Fr as Fr
16 -- | Ngrams selection algorithms
17 -- A form is a list of characters separated by one or more spaces in a sentence.
21 -- For performance reasons, Type Text is used, then:
25 -- Let be a form and its associated forms in contexts of a sentence.
26 -- Forms and subforms can be represented as a Tree whose top is the minimal form
27 -- as a monogram whose occurrences are
29 -- ps : Common words function in Haskell do not take sentence into account
32 -- TODO for scientific papers: add measures
33 -- TODO add the p score regex
--
-- @extractNgrams lang input@ runs @extractNgrams'@ on the input and then
-- applies 'groupNgrams' (for the same language) to each inner list of the
-- result.
extractNgrams :: Language -> Text -> IO [[Ngrams]]
extractNgrams lang input = groupEach <$> extractNgrams' lang input
  where
    -- Apply the language-specific grouping to every inner list.
    groupEach tokenLists = [groupNgrams lang tokens | tokens <- tokenLists]
-- | First extraction stage; 'extractNgrams' calls it and then groups the result.
-- The visible part of the pipeline maps '_sentenceTokens' over its input and
-- then converts every token of every resulting list with 'token2text'.
-- NOTE(review): the expression appears to continue beyond what is shown here
-- (the @lang@ and @t@ arguments are not used in the visible part) - confirm
-- against the full file before relying on this description.
38 extractNgrams' :: Language -> Text -> IO [[Ngrams]]
39 extractNgrams' lang t = map (map token2text)
40 <$> map _sentenceTokens
-- | Select ngrams according to the grammar specific to the given language:
-- 'EN' is handled by @En.selectNgrams@ and 'FR' by @Fr.selectNgrams@.
--
-- Tag names differ between languages: JJ in English is ADJectiv in French.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
-- | Analyze and group (or not) ngrams according to the grammar specific to
-- the given language: 'EN' is handled by @En.groupNgrams@ and 'FR' by
-- @Fr.groupNgrams@.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams