]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Parser.hs
[FIS][FIX] Frequent Item Set and fix ngrams extraction test.
[gargantext.git] / src / Gargantext / Ngrams / Parser.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE ScopedTypeVariables #-}
3 {-# LANGUAGE NoImplicitPrelude #-}
4
5 module Gargantext.Ngrams.Parser where
6
7 import Gargantext.Prelude
8 import Gargantext.Ngrams.CoreNLP
9 import Data.Text hiding (map)
10
11 import Gargantext.Types.Main (Language(..))
12 import qualified Gargantext.Ngrams.Lang.En as En
13 import qualified Gargantext.Ngrams.Lang.Fr as Fr
14
-- | A single token as a triple of 'Text' values, as produced by
-- @token2text@ in 'extractNgrams''.
--
-- NOTE(review): presumably (word form, part-of-speech tag, named-entity
-- tag) -- confirm against "Gargantext.Ngrams.CoreNLP".
type SNgrams =
  ( Text
  , Text
  , Text
  )
16
17 -- | Ngrams selection algorithms
18 -- A form is a list of characters separated by one or more spaces in a sentence.
19 -- A word is a form.
20
21 -- type Form = [Char]
22 -- For performance reasons, Type Text is used, then:
23 -- type Form = Text
24
25
26 -- Consider a form and its associated forms in the context of a sentence.
27 -- Forms and subforms can be represented as a tree whose root is the minimal form,
28 -- i.e. a monogram, whose occurrences are ... (TODO: this sentence was left unfinished)
29
30 -- PS: the common `words` function in Haskell does not take sentences into account.
31
32
-- TODO for scientific papers: add measures
-- TODO add the p score regex

-- | Extract the ngrams of a text, one list per sentence.
--
-- The per-sentence token lists returned by @extractNgrams'@ are each passed
-- through 'groupNgrams' for the given language.
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang s = groupAll <$> extractNgrams' lang s
  where
    -- Apply the language-specific grouping to every sentence.
    groupAll sentences = map (groupNgrams lang) sentences
37
38
-- | Call 'corenlp' on the text and convert every token of every sentence
-- with 'token2text', yielding one list of 'SNgrams' per sentence.
--
-- No language-specific grouping is applied here; see 'extractNgrams'.
extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' lang t = perSentence <$> corenlp lang t
  where
    -- One result list per sentence of the parsed document.
    perSentence parsed = map sentenceNgrams (_sentences parsed)
    -- Convert each token of a single sentence.
    sentenceNgrams sentence = map token2text (_sentenceTokens sentence)
44
-- | Select ngrams according to the grammar of the given language, by
-- delegating to that language's own @selectNgrams@.
--
-- Tag names differ between languages: the English tag JJ corresponds to
-- ADJ (adjective) in French.
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
51
-- | Analyse and group (or not) ngrams according to the grammar of the given
-- language, by delegating to that language's own @groupNgrams@.
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams
57