]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text.hs
[Structure] Ngrams -> Text.
[gargantext.git] / src / Gargantext / Text.hs
1 {-|
2 Module : Gargantext.Text
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams exctration.
11
12 Definitions of ngrams.
13 n non negative integer
14
15 -}
16
17 {-# LANGUAGE NoImplicitPrelude #-}
18
19 module Gargantext.Text ( module Gargantext.Text.Letters
20 --, module Gargantext.Text.Hetero
21 , module Gargantext.Text.CoreNLP
22 , module Gargantext.Text.Parser
23 , module Gargantext.Text.Occurrences
24 , module Gargantext.Text.TextMining
25 , module Gargantext.Text.Metrics
26 , Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
27 , ListName(..), equivNgrams, isGram, sentences
28 , ngramsTest
29 ) where
30
31 import Gargantext.Text.Letters
32 --import Gargantext.Text.Hetero
33 import Gargantext.Text.CoreNLP
34 import Gargantext.Text.Parser
35
36 import Gargantext.Text.Occurrences
37 import Gargantext.Text.TextMining
38 --import Gargantext.Text.Words
39
40 import Gargantext.Text.Metrics
41 import qualified Gargantext.Text.FrequentItemSet as FIS
42 -----------------------------------------------------------------
43
44 import Data.List (sort)
45 import Data.Char (Char, isAlphaNum, isSpace)
46 import Data.Text (Text, filter, toLower, split, lines, concat)
47 import qualified Data.Text as DT
48 import Data.Text.IO (readFile)
49
50 import Data.Map.Strict (Map
51 , empty
52 , insertWith, unionWith
53 , lookupIndex
54 --, fromList, keys
55 )
56 import qualified Data.Map.Strict as M (filter)
57 import Data.Foldable (foldl')
58 import Gargantext.Prelude hiding (filter)
59
60 -- Maybe useful later:
61 --import NLP.Stemmer (stem, Stemmer(..))
62 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
63 --import Language.Aspell.Options (ACOption(..))
64
65
66 data ListName = Stop | Candidate | Graph
67 deriving (Show, Eq)
68
69 data Ngrams = Ngrams { _ngramsNgrams :: [Text]
70 , _ngramsStem :: [Text]
71 , _ngramsListName :: Maybe ListName
72 } deriving (Show)
73
74 equivNgrams :: Ngrams -> Ngrams -> Bool
75 equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
76 = (sort n1) == (sort n2) || (sort s1) == (sort s2)
77
78 type Occ = Int
79 --type Index = Int
80
81 -- Data Ngrams = Monograms | MultiGrams
82
83 ngrams :: Text -> [Text]
84 ngrams xs = monograms $ toLower $ filter isGram xs
85
86 clean :: Text -> Text
87 clean txt = DT.map clean' txt
88 where
89 clean' '’' = '\''
90 clean' c = c
91
92 monograms :: Text -> [Text]
93 monograms txt = split isWord txt
94 where
95 isWord c = c `elem` [' ', '\'', ',', ';']
96
97 isGram :: Char -> Bool
98 isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
99
100 -- | Compute the occurrences (occ)
101 occ :: Ord a => [a] -> Map a Occ
102 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
103
104 -- TODO add groups and filter stops
105 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
106 sumOcc xs = foldl' (unionWith (+)) empty xs
107
108 --noApax :: Ord a => Map a Occ -> Map a Occ
109 --noApax m = M.filter (>1) m
110
111 -- | /!\ indexes are not the same:
112
113 -- | Index ngrams from Map
114 --indexNgram :: Ord a => Map a Occ -> Map Index a
115 --indexNgram m = fromList (zip [1..] (keys m))
116
117 -- | Index ngrams from Map
118 --ngramIndex :: Ord a => Map a Occ -> Map a Index
119 --ngramIndex m = fromList (zip (keys m) [1..])
120
121 indexWith :: Ord a => Map a Occ -> [a] -> [Int]
122 indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
123
124 indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
125 indexIt xs = (m, is)
126 where
127 m = sumOcc (map occ xs)
128 is = map (indexWith m) xs
129
130 list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
131 list2fis n xs = (m', fs)
132 where
133 (m, is) = indexIt xs
134 m' = M.filter (>50000) m
135 fs = FIS.all n is
136
137 text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
138 text2fis n xs = list2fis n (map ngrams xs)
139
140 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
141 --text2fisWith = undefined
142
143 -------------------------------------------------------------------
144 -- Contexts of text
145
146 sentences :: Text -> [Text]
147 sentences txt = split isStop txt
148
149 isStop :: Char -> Bool
150 isStop c = c `elem` ['.','?','!']
151
152
153 -- | Tests
154 ngramsTest fp = ws
155 where
156 txt = concat <$> lines <$> clean <$> readFile fp
157 -- | Number of sentences
158 ls = sentences <$> txt
159 -- | Number of monograms used in the full text
160 ws = ngrams <$> txt
161 -- | stem ngrams
162 -- TODO
163 -- group ngrams
164 ocs = occ <$> ws
165
166 --
167
168