1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE ScopedTypeVariables #-}
3 {-# LANGUAGE NoImplicitPrelude #-}
5 module Gargantext.Ngrams.Parser where
7 import Gargantext.Prelude
8 import Gargantext.Ngrams.CoreNLP
9 import Data.Text hiding (map)
11 import Gargantext.Types.Main (Language(..))
12 import qualified Gargantext.Ngrams.Lang.En as En
13 import qualified Gargantext.Ngrams.Lang.Fr as Fr
-- | Element type of the ngram lists handled in this module: a triple of 'Text'.
-- NOTE(review): the role of each of the three components is not stated here;
-- presumably they are filled in by @token2text@ -- confirm against its definition.
15 type SNgrams = (Text, Text, Text)
17 -- | Ngrams selection algorithms
18 -- A form is a list of characters separated by one or more spaces in a sentence.
22 -- For performance reasons, Type Text is used, then:
26 -- Consider a form and its associated forms in the contexts of a sentence.
27 -- Forms and subforms can be represented as a Tree whose top is the minimal form
28 -- as a monogram whose occurrences are
30 -- PS: the common words function in Haskell does not take sentences into account
33 -- TODO for scientific papers: add measures
34 -- TODO add the p score regex
--
-- @extractNgrams lang txt@ runs @extractNgrams'@ on the same arguments and
-- then applies 'groupNgrams' for @lang@ to each inner list of the result.
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang txt = regroup <$> extractNgrams' lang txt
  where
    -- Each inner list is grouped independently of the others.
    regroup = map (groupNgrams lang)
39 extractNgrams' :: Language -> Text -> IO [[SNgrams]]
40 extractNgrams' lang t = map (map token2text)
41 <$> map _sentenceTokens
-- | Select ngrams using the grammar rules specific to the given language.
--
-- The work is delegated to the language module: @En.selectNgrams@ for 'EN',
-- @Fr.selectNgrams@ for 'FR'. Tag names differ between languages: the tag
-- @JJ@ used for English corresponds to @ADJ@ (adjective) in French.
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
-- | Analyze the ngrams and group them (or leave them ungrouped) using the
-- grammar rules specific to the given language.
--
-- The work is delegated to the language module: @En.groupNgrams@ for 'EN',
-- @Fr.groupNgrams@ for 'FR'.
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams