]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/FrequentItemSet.hs
merge done
[gargantext.git] / src / Gargantext / Core / Text / Metrics / FrequentItemSet.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Domain Specific Language to manage Frequent Item Set (FIS)
11
12 -}
13
14
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
16 ( Fis, Size(..)
17 , occ_hlcm, cooc_hlcm
18 , allFis, between
19 , fisWithSize
20 , fisWith
21 , fisWithSizePoly
22 , fisWithSizePoly2
23 , fisWithSizePolyMap
24 , fisWithSizePolyMap'
25 , module HLCM
26 )
27 where
28
29 import Data.List (concat, null)
30 import Data.Map.Strict (Map)
31 import Data.Maybe (catMaybes)
32 import Data.Set (Set)
33 import Gargantext.Prelude
34 import HLCM
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Vector as V
38
39 import Control.Monad (sequence)
40
41 data Size = Point Int | Segment Int Int
42
43 ------------------------------------------------------------------------
44 -- | Occurrence is Frequent Item Set of size 1
45 occ_hlcm :: Frequency -> [[Item]] -> [Fis]
46 occ_hlcm = fisWithSize (Point 1)
47
48 -- | Cooccurrence is Frequent Item Set of size 2
49 cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
50 cooc_hlcm = fisWithSize (Point 2)
51
52 allFis :: Frequency -> [[Item]] -> [Fis]
53 allFis = fisWith Nothing
54
55 ------------------------------------------------------------------------
56 between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
57 between (x,y) = fisWithSize (Segment x y)
58
59 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
60 --maximum m = between (0,m)
61
62
63 ------------------------------------------------------------------------
64 ------------------------------------------------------------------------
65 -- | Data type to type the Frequent Item Set
66 -- TODO replace List with Set in fisItemSet
67 -- be careful : risks to erase HLCM behavior
68 type Fis = Fis' Item
69 data Fis' a = Fis' { _fisCount :: Int
70 , _fisItemSet :: [a]
71 } deriving (Show)
72
73 instance Functor Fis' where
74 fmap f (Fis' c is) = Fis' c (fmap f is)
75
76 -- | Sugar from items to FIS
77 items2fis :: [Item] -> Maybe Fis
78 items2fis [] = Nothing
79 items2fis (i:is) = Just $ Fis' i is
80
81 ------------------------------------------------------------------------
82 ------------------------------------------------------------------------
83
84 fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
85 fisWithSize n f is = case n of
86 Point n' -> fisWith (Just (\x -> length x == (n'+1) )) f is
87 Segment a b -> fisWith (Just (\x -> cond a (length x) b)) f is
88 where
89 cond a' x b' = a' <= x && x <= b'
90
91
92 --- Filter on Fis and not on [Item]
93 fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
94 fisWith s f is = case filter (not . null) is of
95 [] -> []
96 js -> catMaybes $ map items2fis $ filter' $ runLCMmatrix js f
97 -- drop unMaybe
98 where
99 filter' = case s of
100 Nothing -> identity
101 Just fun -> filter fun
102
103 -- Here the sole purpose to take the keys as a Set is tell we do not want
104 -- duplicates.
105 fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
106 fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
107 where
108 ksv = V.fromList $ Set.toList ks
109 ksm = Map.fromList . flip zip [0..] $ V.toList ksv
110 toItem = (ksm Map.!)
111 fromItem = (ksv V.!)
112
113 fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
114 fisWithSizePoly2 n f is = fisWithSizePoly n f ks is
115 where
116 ks = Set.fromList $ concat is
117
118 fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
119 fisWithSizePolyMap n f is =
120 Map.fromList $ (\i -> (Set.fromList (_fisItemSet i), _fisCount i)) <$> fisWithSizePoly2 n f is
121
122
123 ------------------------------------------------------------------------
124 ------------------------------------------------------------------------
125
126 ---- Weighted [[Item]]
127
128 isSublistOf :: Ord a => [a] -> [a] -> Bool
129 isSublistOf sub lst = all (\i -> elem i lst) sub
130
131 reIndexFis :: Ord a => [([a],b)] -> [Fis' a] -> [(Fis' a,[b])]
132 reIndexFis items fis = map (\f ->
133 let docs = filter (\(lst,_) -> isSublistOf (_fisItemSet f) lst) items
134 in (f, map snd docs)) fis
135
136 wsum :: [Maybe Double] -> Maybe Double
137 wsum lst = fmap sum $ sequence lst
138
139 fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a],Maybe Double)] -> Map (Set a) (Int, Maybe Double)
140 fisWithSizePolyMap' n f is = Map.fromList
141 $ map (\(fis,ws) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws))))
142 $ reIndexFis is
143 $ fisWithSizePoly2 n f (map fst is)
144
145 ------------------------------------------------------------------------
146 ------------------------------------------------------------------------
147
148
149 --
150 ---- | /!\ indexes are not the same:
151 --
152 ---- | Index ngrams from Map
153 ----indexNgram :: Ord a => Map a Occ -> Map Index a
154 ----indexNgram m = fromList (zip [1..] (keys m))
155 --
156 ---- | Index ngrams from Map
157 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
158 ----ngramIndex m = fromList (zip (keys m) [1..])
159 --
160 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
161 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
162 --
163 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
164 --indexIt xs = (m, is)
165 -- where
166 -- m = sumOcc (map occ xs)
167 -- is = map (indexWith m) xs
168 --
169 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
170 --list2fis n xs = (m', fs)
171 -- where
172 -- (m, is) = indexIt xs
173 -- m' = M.filter (>50000) m
174 -- fs = FIS.all n is
175 --
176 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
177 --text2fis n xs = list2fis n (map terms xs)
178 --
179 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
180 ----text2fisWith = undefined
181 --
182