2 Module : Gargantext.Text
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 Definitions of ngrams.
13 n non negative integer
17 {-# LANGUAGE NoImplicitPrelude #-}
19 module Gargantext.Text ( module Gargantext.Text.Letters
20 --, module Gargantext.Text.Hetero
21 , module Gargantext.Text.CoreNLP
22 , module Gargantext.Text.Parser
23 , module Gargantext.Text.Occurrences
24 , module Gargantext.Text.TextMining
25 , module Gargantext.Text.Metrics
26 , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
27 , ListName(..), equivNgrams, isGram, sentences
31 import Gargantext.Text.Letters
32 --import Gargantext.Text.Hetero
33 import Gargantext.Text.CoreNLP
34 import Gargantext.Text.Parser
36 import Gargantext.Text.Occurrences
37 import Gargantext.Text.TextMining
38 --import Gargantext.Text.Words
40 import Gargantext.Text.Metrics
41 import qualified Gargantext.Text.FrequentItemSet as FIS
42 -----------------------------------------------------------------
44 import Data.List (sort)
45 import Data.Char (Char, isAlphaNum, isSpace)
46 import Data.Text (Text, filter, toLower, split, lines, concat)
47 import qualified Data.Text as DT
48 import Data.Text.IO (readFile)
50 import Data.Map.Strict (Map
52 , insertWith, unionWith
56 import qualified Data.Map.Strict as M (filter)
57 import Data.Foldable (foldl')
58 import Gargantext.Prelude hiding (filter)
60 -- Maybe useful later:
61 --import NLP.Stemmer (stem, Stemmer(..))
62 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
63 --import Language.Aspell.Options (ACOption(..))
66 data ListName = Stop | Candidate | Graph
69 data Ngrams = Ngrams { _ngramsNgrams :: [Text]
70 , _ngramsStem :: [Text]
71 , _ngramsListName :: Maybe ListName
74 equivNgrams :: Ngrams -> Ngrams -> Bool
75 equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
76 = (sort n1) == (sort n2) || (sort s1) == (sort s2)
81 -- Data Ngrams = Monograms | MultiGrams
83 ngrams :: Text -> [Text]
84 ngrams xs = monograms $ toLower $ filter isGram xs
87 clean txt = DT.map clean' txt
92 monograms :: Text -> [Text]
93 monograms txt = split isWord txt
95 isWord c = c `elem` [' ', '\'', ',', ';']
97 isGram :: Char -> Bool
98 isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
100 -- | Compute the occurrences (occ)
101 occ :: Ord a => [a] -> Map a Occ
102 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
104 -- TODO add groups and filter stops
105 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
106 sumOcc xs = foldl' (unionWith (+)) empty xs
108 --noApax :: Ord a => Map a Occ -> Map a Occ
109 --noApax m = M.filter (>1) m
111 -- | /!\ indexes are not the same:
113 -- | Index ngrams from Map
114 --indexNgram :: Ord a => Map a Occ -> Map Index a
115 --indexNgram m = fromList (zip [1..] (keys m))
117 -- | Index ngrams from Map
118 --ngramIndex :: Ord a => Map a Occ -> Map a Index
119 --ngramIndex m = fromList (zip (keys m) [1..])
121 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
122 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
124 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
127 m = sumOcc (map occ xs)
128 is = map (indexWith m) xs
130 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
131 list2fis n xs = (m', fs)
134 m' = M.filter (>50000) m
137 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
138 text2fis n xs = list2fis n (map ngrams xs)
140 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
141 --text2fisWith = undefined
143 -------------------------------------------------------------------
146 sentences :: Text -> [Text]
147 sentences txt = split isStop txt
149 isStop :: Char -> Bool
150 isStop c = c `elem` ['.','?','!']
156 txt = concat <$> lines <$> clean <$> readFile fp
157 -- | Number of sentences
158 ls = sentences <$> txt
159 -- | Number of monograms used in the full text