2 Module : Gargantext.Ngrams.Parser
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
Extraction of ngrams from a text, with selection and grouping rules
dispatched to the language-specific modules (@En@, @Fr@).
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE ScopedTypeVariables #-}
18 module Gargantext.Ngrams.Parser where
20 import Gargantext.Prelude
21 import Gargantext.Ngrams.CoreNLP
22 import Data.Text hiding (map)
24 import Gargantext.Types.Main (Language(..))
25 import qualified Gargantext.Ngrams.Lang.En as En
26 import qualified Gargantext.Ngrams.Lang.Fr as Fr
-- | An ngram represented as a triple of 'Text' values.
-- NOTE(review): presumably (form, part-of-speech tag, named-entity tag) as
-- produced by @token2text@ -- TODO confirm against "Gargantext.Ngrams.CoreNLP".
type SNgrams = (Text, Text, Text)
30 -- | Ngrams selection algorithms
-- A form is a list of characters separated by one or more spaces in a sentence.
35 -- For performance reasons, Type Text is used, then:
-- Consider a form and its associated forms in the contexts of a sentence.
-- Forms and subforms can be represented as a Tree whose top is the minimal form
-- as a monogram whose occurrences are
-- PS: the common @words@ function in Haskell does not take sentences into account
-- TODO for scientific papers: add measures
47 -- TODO add the p score regex
extractNgrams
  :: Language        -- ^ language whose grouping rules are applied
  -> Text            -- ^ text to extract the ngrams from
  -> IO [[SNgrams]]  -- ^ result of 'extractNgrams'' with every inner list grouped
extractNgrams lang txt = groupEach <$> extractNgrams' lang txt
  where
    -- Run the language-specific grouping on each inner list of ngrams.
    groupEach = map (groupNgrams lang)
52 extractNgrams' :: Language -> Text -> IO [[SNgrams]]
53 extractNgrams' lang t = map (map token2text)
54 <$> map _sentenceTokens
-- | Select ngrams according to the grammar specific to the given language,
-- by delegating to that language's own @selectNgrams@ implementation.
--
-- Tag names differ between languages: what is tagged JJ in English is
-- ADJ (adjective) in French.
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
-- | Analyze and group (or not) ngrams according to the grammar specific to
-- the given language, by delegating to that language's own @groupNgrams@
-- implementation.
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams