src/Gargantext/Ngrams/Parser.hs

   1 {-# LANGUAGE OverloadedStrings #-}
   2 {-# LANGUAGE ScopedTypeVariables #-}
   3 {-# LANGUAGE NoImplicitPrelude   #-}
   4
   5 module Gargantext.Ngrams.Parser where
   6
   7 import Gargantext.Prelude
   8 import Gargantext.Ngrams.CoreNLP
   9 import Data.Text hiding (map)
  10
  11 import Gargantext.Types.Main (Language(..), Ngrams)
  12 import qualified Gargantext.Ngrams.Lang.En as En
  13 import qualified Gargantext.Ngrams.Lang.Fr as Fr
  14
  15
  16 -- | Ngrams selection algorithms
  17 -- A form is a list of characters separated by one or more spaces in a sentence.
  18 -- A word is a form.
  19
  20 -- type Form = [Char]
  21 -- For performance reasons, Type Text is used, then:
  22 -- type Form = Text
  23
  24
  25 -- Consider a form and its associated forms in the contexts of a sentence.
  26 -- Forms and subforms can be represented as a Tree whose top is the minimal form
  27 -- as a monogram whose occurrences are [...] (TODO: this sentence was left unfinished)
  28
  29 -- PS: the common `words` function in Haskell does not take the sentence into account.
  30
  31 -- TODO for scientific papers: add measures
  32 -- TODO add the p score regex
  33 -- | Ngrams of each sentence of the text, grouped with 'groupNgrams' for @lang@.
  34 extractNgrams :: Language -> Text -> IO [[Ngrams]]
  35 extractNgrams lang s = groupEach <$> extractNgrams' lang s
  36   where groupEach = map (groupNgrams lang)
  37
  38 -- | Call 'corenlp' on the text and map 'token2text' over each sentence's tokens.
  39 extractNgrams' :: Language -> Text -> IO [[Ngrams]]
  40 extractNgrams' lang t = sentenceNgrams <$> corenlp lang t
  41   where sentenceNgrams doc = map tokensOf (sentences doc)
  42         tokensOf sentence  = map token2text (_sentenceTokens sentence)
  43
  44 -- | Select ngrams according to the grammar specific to each language
  45 --   (original note: JJ in English is ADJ in French), by delegating to
  46 --   the 'selectNgrams' of the matching language module.
  47 selectNgrams :: Language -> [Ngrams] -> [Ngrams]
  48 selectNgrams lang = case lang of
  49   EN -> En.selectNgrams; FR -> Fr.selectNgrams
  50
  51 -- | Analyze and group (or not) ngrams according to the grammar specific
  52 --   to each language, by delegating to that language module's 'groupNgrams'.
  53 groupNgrams :: Language -> [Ngrams] -> [Ngrams]
  54 groupNgrams lang = case lang of
  55   EN -> En.groupNgrams
  56   FR -> Fr.groupNgrams
  57