]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Parser.hs
[CLEAN] Code.
[gargantext.git] / src / Gargantext / Ngrams / Parser.hs
1 {-# LANGUAGE OverloadedStrings #-}
2 {-# LANGUAGE ScopedTypeVariables #-}
3 {-# LANGUAGE NoImplicitPrelude #-}
4
5 module Gargantext.Ngrams.Parser where
6
7 import Gargantext.Prelude
8 import Gargantext.Ngrams.CoreNLP
9 import Data.Text hiding (map)
10
11 import Gargantext.Types.Main (Language(..), Ngrams)
12 import qualified Gargantext.Ngrams.Lang.En as En
13 import qualified Gargantext.Ngrams.Lang.Fr as Fr
14
15
16 -- | Ngrams selection algorithms
17 -- A form is a list of characters separated by one or more spaces in a sentence.
18 -- A word is a form.
19
20 -- type Form = [Char]
21 -- For performance reasons, Type Text is used, then:
22 -- type Form = Text
23
24
25 -- Let there be a form and its associated forms in the contexts of a sentence.
26 -- Forms and subforms can be represented as a Tree whose top is the minimal form
27 -- as a monogram whose occurrences are ...
28
29 -- PS: the common `words` function in Haskell does not take sentences into account
30
31
32 -- TODO for scientific papers: add measures
33 -- TODO add the p score regex
-- | Extract the ngrams of a text for the given language.
--
-- Runs 'extractNgrams'' on the text and then applies 'groupNgrams' (for the
-- same language) to every inner list of its result.
extractNgrams :: Language -> Text -> IO [[Ngrams]]
extractNgrams lang s = groupEach <$> extractNgrams' lang s
  where
    -- Apply the language-specific grouping to each inner list independently.
    groupEach perSentence = [groupNgrams lang ngrams | ngrams <- perSentence]
36
37
-- | Run 'corenlp' on the text for the given language and return, for every
-- element of 'sentences' in its result, the '_sentenceTokens' of that element
-- converted with 'token2text'.
--
-- No grouping is applied here; see 'groupNgrams' for that step.
extractNgrams' :: Language -> Text -> IO [[Ngrams]]
extractNgrams' lang t = tokenise <$> corenlp lang t
  where
    -- One inner list per element of 'sentences' in the parsed result.
    tokenise parsed = map sentenceNgrams (sentences parsed)
    -- Convert every token of a single sentence.
    sentenceNgrams sentence = map token2text (_sentenceTokens sentence)
43
-- | Select ngrams according to the grammar of the given language.
--
-- The work is delegated to the language module: 'En.selectNgrams' for 'EN'
-- and 'Fr.selectNgrams' for 'FR'. Tag names differ between languages; for
-- instance the tag called JJ in English is the adjective tag (ADJ) in French.
--
-- NOTE(review): only 'EN' and 'FR' are handled; if 'Language' has further
-- constructors this match is partial -- confirm against its definition.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams lang = case lang of
  EN -> En.selectNgrams
  FR -> Fr.selectNgrams
50
-- | Analyse a list of ngrams and group them (or leave them ungrouped)
-- according to the grammar of the given language.
--
-- The work is delegated to the language module: 'En.groupNgrams' for 'EN'
-- and 'Fr.groupNgrams' for 'FR'.
--
-- NOTE(review): only 'EN' and 'FR' are handled; if 'Language' has further
-- constructors this match is partial -- confirm against its definition.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams lang = case lang of
  EN -> En.groupNgrams
  FR -> Fr.groupNgrams
56
57