]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams.hs
[FIX] Count improving type.
[gargantext.git] / src / Gargantext / Ngrams.hs
1 {-|
2 Module : Gargantext.Ngrams
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams extraction.
11
12 Definitions of ngrams.
13 where n is a non-negative integer.
14
15 -}
16
17 module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
18 --, module Gargantext.Ngrams.Hetero
19 , module Gargantext.Ngrams.CoreNLP
20 , module Gargantext.Ngrams.Parser
21 , module Gargantext.Ngrams.Occurrences
22 , module Gargantext.Ngrams.TextMining
23 , module Gargantext.Ngrams.Metrics
24 , Ngrams(..), ngrams, occ, sumOcc, text2fis
25 , NgramsList(..)
26 --, module Gargantext.Ngrams.Words
27 ) where
28
29 import Gargantext.Ngrams.Letters
30 --import Gargantext.Ngrams.Hetero
31 import Gargantext.Ngrams.CoreNLP
32 import Gargantext.Ngrams.Parser
33
34 import Gargantext.Ngrams.Occurrences
35 import Gargantext.Ngrams.TextMining
36 --import Gargantext.Ngrams.Words
37
38 import Gargantext.Ngrams.Metrics
39 import qualified Gargantext.Ngrams.FrequentItemSet as FIS
40 -----------------------------------------------------------------
41
42 import Data.List (sort)
43 import Data.Char (Char, isAlpha, isSpace)
44 import Data.Text (Text, words, filter, toLower)
45 import Data.Map.Strict (Map
46 , empty
47 , insertWith, unionWith
48 , lookupIndex
49 --, fromList, keys
50 )
51 import qualified Data.Map.Strict as M (filter)
52 import Data.Foldable (foldl')
53 import Gargantext.Prelude hiding (filter)
54
55 -- Maybe useful later:
56 --import NLP.Stemmer (stem, Stemmer(..))
57 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
58 --import Language.Aspell.Options (ACOption(..))
59
60
-- | Tag naming the list an ngram has been assigned to.
--
-- NOTE(review): presumably 'Stop' marks stop-words, 'Candidate' marks
-- terms awaiting a decision and 'Graph' marks terms kept for the graph;
-- confirm against the callers, this module does not use the tags itself.
data NgramsList
  = Stop
  | Candidate
  | Graph
  deriving (Show, Eq)
63
-- | An ngram: its surface tokens, the corresponding stems, and the list
-- (if any) it has been assigned to.
data Ngrams = Ngrams
  { _ngramsNgrams :: [Text]            -- ^ tokens of the ngram
  , _ngramsStem   :: [Text]            -- ^ stems (presumably one per token; confirm)
  , _ngramsList   :: Maybe NgramsList  -- ^ list membership, 'Nothing' when unassigned
  }
  deriving (Show)
68
-- | Two 'Ngrams' compare equal when their tokens are the same up to
-- ordering, or when their stems are the same up to ordering. The list
-- membership field is ignored.
--
-- NOTE(review): because the two criteria are combined with @||@ the
-- relation is not transitive, and two values whose stem lists are both
-- empty always compare equal. Confirm this is intended.
instance Eq Ngrams where
  Ngrams tokensA stemsA _ == Ngrams tokensB stemsB _ =
    sort tokensA == sort tokensB || sort stemsA == sort stemsB
71
-- | Number of occurrences of an item (a plain 'Int' counter).
type Occ = Int
73 --type Index = Int
74
-- | Turn a text into its list of tokens: characters rejected by 'isChar'
-- are removed, the remainder is lower-cased, and the result is split into
-- tokens by 'monograms'.
ngrams :: Text -> [Text]
ngrams xs = monograms lowered
  where
    kept    = filter isChar xs
    lowered = toLower kept
77
-- | Split a text into single-word tokens (delegates to 'words' from
-- "Data.Text", i.e. splits on white space).
monograms :: Text -> [Text]
monograms txt = words txt
80
-- | Decide whether a character is kept when cleaning a text: hyphens,
-- slashes, alphabetic characters and white space are accepted, everything
-- else (digits, punctuation, ...) is rejected.
--
-- TODO: the original note here reads "12-b"; presumably tokens mixing
-- digits and letters should be supported, since digits are currently
-- rejected. Confirm.
isChar :: Char -> Bool
isChar c = c == '-' || c == '/' || isAlpha c || isSpace c
87
-- | Count how many times each distinct element occurs in a list.
occ :: Ord a => [a] -> Map a Occ
occ = foldl' countOne empty
  where
    -- Record one more occurrence of @item@ (a first sighting counts as 1).
    countOne counts item = insertWith (+) item 1 counts
91
-- | Merge several occurrence maps into one, adding up the counts of keys
-- that appear in more than one map.
--
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc maps = foldl' (unionWith (+)) empty maps
95
96 --noApax :: Ord a => Map a Occ -> Map a Occ
97 --noApax m = M.filter (>1) m
98
99 -- | /!\ indexes are not the same:
100
101 -- | Index ngrams from Map
102 --indexNgram :: Ord a => Map a Occ -> Map Index a
103 --indexNgram m = fromList (zip [1..] (keys m))
104
105 -- | Index ngrams from Map
106 --ngramIndex :: Ord a => Map a Occ -> Map a Index
107 --ngramIndex m = fromList (zip (keys m) [1..])
108
-- | Replace each element of the list by its position (as given by
-- 'lookupIndex') among the keys of the map.
--
-- NOTE(review): 'unMaybe' is defined outside this file; presumably it
-- discards the 'Nothing' results, so elements absent from the map would
-- be dropped silently. Confirm.
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith table items = unMaybe (map positionOf items)
  where
    positionOf item = lookupIndex item table
111
-- | Build the global occurrence map of a collection of lists and re-encode
-- every list with 'indexWith' against that map.
--
-- Returns the summed occurrence map together with the index-encoded lists
-- (one output list per input list, in the same order).
indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
indexIt xs =
  let counts  = sumOcc (map occ xs)
      indexed = map (indexWith counts) xs
  in  (counts, indexed)
117
-- | Index a collection of lists with 'indexIt' and run @FIS.all n@ on the
-- index-encoded lists.
--
-- The returned map only keeps the entries whose count is strictly greater
-- than 50000.
--
-- NOTE(review): the 50000 threshold is hard-coded, and the indices handed
-- to @FIS.all@ are positions in the /unfiltered/ map, so they do not line
-- up with the filtered map that is returned. Confirm both are intended.
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs =
  let (counts, indexed) = indexIt xs
      frequentCounts    = M.filter (> 50000) counts
      itemSets          = FIS.all n indexed
  in  (frequentCounts, itemSets)
124
-- | Tokenise every text with 'ngrams' and pass the resulting token lists,
-- together with the frequency argument @n@, to 'list2fis'.
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n tokenised
  where
    tokenised = map ngrams xs
127
128 --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
129 --text2fisWith = undefined
130
131