]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams.hs
[FIS] Frequent Item Set DSL.
[gargantext.git] / src / Gargantext / Ngrams.hs
1 {-|
2 Module : Gargantext.Ngrams
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams extraction.
11
12 Definitions of ngrams.
13 n: a non-negative integer
14
15 -}
16
17 module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
18 --, module Gargantext.Ngrams.Hetero
19 , module Gargantext.Ngrams.CoreNLP
20 , module Gargantext.Ngrams.Parser
21 , module Gargantext.Ngrams.Occurrences
22 , module Gargantext.Ngrams.TextMining
23 , module Gargantext.Ngrams.Metrics
24 , ngrams, occ, sumOcc, text2fis
25 --, module Gargantext.Ngrams.Words
26 ) where
27
28 import Gargantext.Ngrams.Letters
29 --import Gargantext.Ngrams.Hetero
30 import Gargantext.Ngrams.CoreNLP
31 import Gargantext.Ngrams.Parser
32
33 import Gargantext.Ngrams.Occurrences
34 import Gargantext.Ngrams.TextMining
35 --import Gargantext.Ngrams.Words
36
37 import Gargantext.Ngrams.Metrics
38 import qualified Gargantext.Ngrams.FrequentItemSet as FIS
39 -----------------------------------------------------------------
40
41 import Data.Char (Char, isAlpha, isSpace)
42 import Data.Text (Text, words, filter, toLower)
43 import Data.Map.Strict (Map, empty, keys
44 , insertWith, unionWith
45 , fromList
46 , lookupIndex
47 )
48 import qualified Data.Map.Strict as M (filter)
49 import Data.Foldable (foldl')
50 import Gargantext.Prelude hiding (filter)
51 import qualified Data.List as L (filter)
52
53 -- Maybe useful later:
54 --import NLP.Stemmer (stem, Stemmer(..))
55 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
56 --import Language.Aspell.Options (ACOption(..))
57
-- | Number of occurrences of an item; the value type of the maps built by 'occ'.
58 type Occ = Int
-- | Integer position given to an ngram by 'indexNgram' and 'ngramIndex'.
59 type Index = Int
-- | Not referenced in the visible part of this module.
-- NOTE(review): presumably a minimal-frequency threshold — TODO confirm.
60 type FreqMin = Int
61
-- | Split a text into lower-cased monograms.
--
-- Every character rejected by 'isGram' is removed (not replaced by a space),
-- the remainder is lower-cased, and the result is cut into tokens by
-- 'monograms'.
ngrams :: Text -> [Text]
ngrams xs = monograms lowered
  where
    kept    = filter isGram xs
    lowered = toLower kept
64
-- | Cut a text into its whitespace-separated tokens.
monograms :: Text -> [Text]
monograms txt = words txt
67
-- | Tell whether a character is kept when tokenising: letters, whitespace,
-- and the two separators @-@ and @/@ are accepted; everything else is not.
isGram :: Char -> Bool
isGram c = case c of
  '-' -> True
  '/' -> True
  _   -> isSpace c || isAlpha c
72
-- | Count how many times each element appears in the list.
occ :: Ord a => [a] -> Map a Occ
occ xs = foldl' countOne empty xs
  where
    -- A first sighting inserts 1; later sightings add 1 to the stored count.
    countOne counts item = insertWith (+) item 1 counts
76
-- | Merge several occurrence maps into one, adding up the counts of keys
-- that appear in more than one map.
--
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' merge empty xs
  where
    merge total counts = unionWith (+) total counts
80
-- | Keep only the entries whose count is strictly greater than 1, i.e. drop
-- every key that was seen exactly once (or less).
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter moreThanOnce m
  where
    moreThanOnce n = n > 1
83
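84 -- /!\ Warning: the indexes are not the same: 'indexNgram' and 'ngramIndex' number the keys from 1, whereas 'indexWith' returns the 0-based positions given by 'lookupIndex'.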
85
-- | Build the index-to-ngram table of a map: its keys, in ascending order,
-- are numbered starting from 1.
indexNgram :: Ord a => Map a Occ -> Map Index a
indexNgram m = fromList numbered
  where
    numbered = zip [1..] (keys m)
89
-- | Build the ngram-to-index table of a map: each key is associated with its
-- rank, in ascending key order, counted from 1.
ngramIndex :: Ord a => Map a Occ -> Map a Index
ngramIndex m = fromList ranked
  where
    ranked = zip (keys m) [1..]
93
-- | Translate a list of elements into their positions in the given map,
-- using 'lookupIndex' for each element.
--
-- The 'Maybe' results are passed through 'unMaybe'.
-- NOTE(review): 'unMaybe' is not defined in this file; presumably it drops
-- the elements that are missing from the map — TODO confirm.
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe (map position xs)
  where
    position key = lookupIndex key m
96
-- | Index a collection of lists against their own vocabulary.
--
-- Returns the occurrence map summed over all the lists ('occ' then 'sumOcc'),
-- together with every input list rewritten by 'indexWith' against that map.
indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
indexIt xs =
  let occurrences = sumOcc $ map occ xs
      indexed     = map (\items -> indexWith occurrences items) xs
  in  (occurrences, indexed)
102
-- | Index the lists with 'indexIt', then pass the given frequency and the
-- indexed lists to @FIS.all@.
--
-- Returns the occurrence map produced by 'indexIt' paired with the result of
-- @FIS.all@.
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs =
  let (occurrences, indexed) = indexIt xs
  in  (occurrences, FIS.all n indexed)
108
-- | Tokenise every text with 'ngrams' and run 'list2fis' on the resulting
-- token lists with the given frequency.
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n tokenised
  where
    tokenised = map ngrams xs
111
-- | Placeholder: not implemented yet. The body is 'undefined', so evaluating
-- this binding fails at runtime. It is not part of the module's export list.
-- NOTE(review): presumably meant as a variant of 'text2fis' that also takes a
-- @FIS.Size@ — TODO confirm and implement.
112 text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
113 text2fisWith = undefined
114