src/Gargantext/Ngrams/Parser.hs

   1 {-# LANGUAGE OverloadedStrings #-}
   2 {-# LANGUAGE ScopedTypeVariables #-}
   3 {-# LANGUAGE NoImplicitPrelude   #-}
   4
   5 module Gargantext.Ngrams.Parser where
   6
   7 import Gargantext.Prelude
   8 import Gargantext.Ngrams.CoreNLP
   9 import Data.Text hiding (map)
  10
  11 import Gargantext.Types.Main (Language(..))
  12 import qualified Gargantext.Ngrams.Lang.En as En
  13 import qualified Gargantext.Ngrams.Lang.Fr as Fr
  14
-- | One token rendered as a triple of 'Text' values; the lists handled by
--   'extractNgrams', 'selectNgrams' and 'groupNgrams' are made of these.
--   NOTE(review): the triple is presumably (token text, part-of-speech tag,
--   named-entity tag) as produced by @token2text@ -- confirm against
--   "Gargantext.Ngrams.CoreNLP".
type SNgrams       = (Text, Text, Text)
  16
-- | Ngrams selection algorithms
-- A form is a list of characters separated by one or more spaces in a sentence.
-- A word is a form.
  20
  21 -- type Form = [Char]
  22 -- For performance reasons, Type Text is used, then:
  23 -- type Form = Text
  24
  25
-- Consider a form and its associated forms in the context of a sentence.
-- Forms and subforms can be represented as a tree whose top is the minimal form,
-- as a monogram whose occurrences are [TODO: sentence left unfinished]
  29
-- PS: the common `words` function in Haskell does not take sentences into account.
  31
  32
  33 -- TODO for scientific papers: add maesures
  34 -- TODO add the p score regex
  35 extractNgrams :: Language -> Text -> IO [[SNgrams]]
  36 extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
  37
  38
  39 extractNgrams' :: Language -> Text -> IO [[SNgrams]]
  40 extractNgrams' lang t =  map (map token2text)
  41                      <$> map _sentenceTokens
  42                      <$> _sentences
  43                      <$> corenlp lang t
  44
  45 -- | This function selects ngrams according to grammars specific
  46 --   of each language.
  47 --   In english, JJ is ADJectiv in french.
  48 selectNgrams :: Language -> [SNgrams] -> [SNgrams]
  49 selectNgrams EN = En.selectNgrams
  50 selectNgrams FR = Fr.selectNgrams
  51
  52 -- | This function analyze and groups (or not) ngrams according to
  53 --   grammars specific of each language.
  54 groupNgrams :: Language -> [SNgrams] -> [SNgrams]
  55 groupNgrams EN = En.groupNgrams
  56 groupNgrams FR = Fr.groupNgrams
  57