src/Gargantext/Text.hs

   1 {-|
   2 Module      : Gargantext.Text
   3 Description : Ngrams tools
   4 Copyright   : (c) CNRS, 2018
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Ngrams exctration.
  11
  12 Definitions of ngrams.
  13 n non negative integer
  14
  15 -}
  16
  17 {-# LANGUAGE NoImplicitPrelude #-}
  18
  19 module Gargantext.Text ( module Gargantext.Text.Letters
  20                               --, module Gargantext.Text.Hetero
  21                          , module Gargantext.Text.CoreNLP
  22                          , module Gargantext.Text.Parser
  23                          , module Gargantext.Text.Occurrences
  24                          , module Gargantext.Text.TextMining
  25                          , module Gargantext.Text.Metrics
  26                          , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
  27                          , ListName(..), equivNgrams, isGram, sentences
  28                          , ngramsTest
  29                          ) where
  30
  31 import Gargantext.Text.Letters
  32 --import Gargantext.Text.Hetero
  33 import Gargantext.Text.CoreNLP
  34 import Gargantext.Text.Parser
  35
  36 import Gargantext.Text.Occurrences
  37 import Gargantext.Text.TextMining
  38 --import Gargantext.Text.Words
  39
  40 import Gargantext.Text.Metrics
  41 import qualified Gargantext.Text.FrequentItemSet as FIS
  42 -----------------------------------------------------------------
  43
  44 import Data.List (sort)
  45 import Data.Char (Char, isAlphaNum, isSpace)
  46 import Data.Text (Text, filter, toLower, split, lines, concat)
  47 import qualified Data.Text as DT
  48 import Data.Text.IO (readFile)
  49
  50 import Data.Map.Strict  (Map
  51                         , empty
  52                         , insertWith, unionWith
  53                         , lookupIndex
  54                         --, fromList, keys
  55                         )
  56 import qualified Data.Map.Strict as M (filter)
  57 import Data.Foldable (foldl')
  58 import Gargantext.Prelude hiding (filter)
  59
  60 -- Maybe useful later:
  61 --import NLP.Stemmer (stem, Stemmer(..))
  62 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
  63 --import Language.Aspell.Options (ACOption(..))
  64
  65
  66 data ListName = Stop | Candidate | Graph
  67   deriving (Show, Eq)
  68
  69 data Ngrams = Ngrams { _ngramsNgrams   :: [Text]
  70                      , _ngramsStem     :: [Text]
  71                      , _ngramsListName :: Maybe ListName
  72                      } deriving (Show)
  73
  74 equivNgrams :: Ngrams -> Ngrams -> Bool
  75 equivNgrams  (Ngrams n1 s1 _) (Ngrams n2 s2 _)
  76   = (sort n1) == (sort n2) || (sort s1) == (sort s2)
  77
  78 type Occ     = Int
  79 --type Index   = Int
  80
  81 -- Data Ngrams = Monograms | MultiGrams
  82
  83 ngrams :: Text -> [Text]
  84 ngrams xs = monograms $ toLower $ filter isGram xs
  85
  86 clean :: Text -> Text
  87 clean txt = DT.map clean' txt
  88   where
  89     clean' '’' = '\''
  90     clean' c  = c
  91
  92 monograms :: Text -> [Text]
  93 monograms txt = split isWord txt
  94   where
  95     isWord c = c `elem` [' ', '\'', ',', ';']
  96
  97 isGram :: Char -> Bool
  98 isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
  99
 100 -- | Compute the occurrences (occ)
 101 occ :: Ord a => [a] -> Map a Occ
 102 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
 103
 104 -- TODO add groups and filter stops
 105 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
 106 sumOcc xs = foldl' (unionWith (+)) empty xs
 107
 108 --noApax :: Ord a => Map a Occ -> Map a Occ
 109 --noApax m = M.filter (>1) m
 110
 111 -- | /!\ indexes are not the same:
 112
 113 -- | Index ngrams from Map
 114 --indexNgram :: Ord a => Map a Occ -> Map Index a
 115 --indexNgram m = fromList (zip [1..] (keys m))
 116
 117 -- | Index ngrams from Map
 118 --ngramIndex :: Ord a => Map a Occ -> Map a Index
 119 --ngramIndex m = fromList (zip (keys m) [1..])
 120
 121 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
 122 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
 123
 124 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
 125 indexIt xs = (m, is)
 126   where
 127     m  = sumOcc (map occ  xs)
 128     is = map    (indexWith m) xs
 129
 130 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
 131 list2fis n xs = (m', fs)
 132   where
 133     (m, is) = indexIt xs
 134     m'      = M.filter (>50000) m
 135     fs      = FIS.all n is
 136
 137 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 138 text2fis n xs = list2fis n (map ngrams xs)
 139
 140 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
 141 --text2fisWith = undefined
 142
 143 -------------------------------------------------------------------
 144 -- Contexts of text
 145
 146 sentences :: Text -> [Text]
 147 sentences txt = split isStop txt
 148
 149 isStop :: Char -> Bool
 150 isStop c = c `elem` ['.','?','!']
 151
 152
 153 -- | Tests
 154 ngramsTest fp =  ws
 155   where
 156     txt = concat <$> lines <$> clean <$> readFile fp
 157     -- | Number of sentences
 158     ls   = sentences <$> txt
 159     -- | Number of monograms used in the full text
 160     ws   = ngrams    <$> txt
 161     -- | stem ngrams
 162     -- TODO
 163     -- group ngrams
 164     ocs  = occ       <$> ws
 165
 166 --
 167
 168