]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/FrequentItemSet.hs
[conduit] attempt to fix length of parsed docs [does not compile]
[gargantext.git] / src / Gargantext / Core / Text / Metrics / FrequentItemSet.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Domain Specific Language to manage Frequent Item Set (FIS)
11
12 -}
13
14
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
16 ( Fis, Size(..)
17 , occ_hlcm, cooc_hlcm
18 , allFis, between
19 , fisWithSize
20 , fisWith
21 , fisWithSizePoly
22 , fisWithSizePoly2
23 , fisWithSizePolyMap
24 , fisWithSizePolyMap'
25 , module HLCM
26 )
27 where
28
29 import Data.List (concat, null)
30 import Data.Map.Strict (Map)
31 import Data.Maybe (catMaybes, isNothing)
32 import Data.Set (Set)
33 import Gargantext.Prelude
34 import HLCM
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Vector as V
38
39 import Control.Monad (sequence)
40
41 data Size = Point Int | Segment Int Int
42
43 ------------------------------------------------------------------------
44 -- | Occurrence is Frequent Item Set of size 1
45 occ_hlcm :: Frequency -> [[Item]] -> [Fis]
46 occ_hlcm = fisWithSize (Point 1)
47
48 -- | Cooccurrence is Frequent Item Set of size 2
49 cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
50 cooc_hlcm = fisWithSize (Point 2)
51
52 allFis :: Frequency -> [[Item]] -> [Fis]
53 allFis = fisWith Nothing
54
55 ------------------------------------------------------------------------
56 between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
57 between (x,y) = fisWithSize (Segment x y)
58
59 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
60 --maximum m = between (0,m)
61
62
63 ------------------------------------------------------------------------
64 ------------------------------------------------------------------------
65 -- | Data type to type the Frequent Item Set
66 -- TODO replace List with Set in fisItemSet
67 -- be careful : risks to erase HLCM behavior
68 type Fis = Fis' Item
69 data Fis' a = Fis' { _fisCount :: Int
70 , _fisItemSet :: [a]
71 } deriving (Show)
72
73 instance Functor Fis' where
74 fmap f (Fis' c is) = Fis' c (fmap f is)
75
76 -- | Sugar from items to FIS
77 items2fis :: [Item] -> Maybe Fis
78 items2fis [] = Nothing
79 items2fis (i:is) = Just $ Fis' i is
80
81 ------------------------------------------------------------------------
82 ------------------------------------------------------------------------
83
84 fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
85 fisWithSize n f is = case n of
86 Point n' -> fisWith (Just (\x -> length x == (n'+1) )) f is
87 Segment a b -> fisWith (Just (\x -> cond a (length x) b)) f is
88 where
89 cond a' x b' = a' <= x && x <= b'
90
91
92 --- Filter on Fis and not on [Item]
93 fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
94 fisWith s f is = case filter (not . null) is of
95 [] -> []
96 js -> catMaybes $ map items2fis $ filter' $ runLCMmatrix js f
97 -- drop unMaybe
98 where
99 filter' = case s of
100 Nothing -> identity
101 Just fun -> filter fun
102
103 -- Here the sole purpose to take the keys as a Set is tell we do not want
104 -- duplicates.
105 fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
106 fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
107 where
108 ksv = V.fromList $ Set.toList ks
109 ksm = Map.fromList . flip zip [0..] $ V.toList ksv
110 toItem = (ksm Map.!)
111 fromItem = (ksv V.!)
112
113 fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
114 fisWithSizePoly2 n f is = fisWithSizePoly n f ks is
115 where
116 ks = Set.fromList $ concat is
117
118 fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
119 fisWithSizePolyMap n f is =
120 Map.fromList $ (\i -> (Set.fromList (_fisItemSet i), _fisCount i)) <$> fisWithSizePoly2 n f is
121
122
123 ------------------------------------------------------------------------
124 ------------------------------------------------------------------------
125
126 ---- Weighted [[Item]]
127
128 isSublistOf :: Ord a => [a] -> [a] -> Bool
129 isSublistOf sub lst = all (\i -> elem i lst) sub
130
131 reIndexFis :: Ord a => [([a],(b,c))] -> [Fis' a] -> [(Fis' a,([b],[c]))]
132 reIndexFis items fis = map (\f ->
133 let docs = filter (\(lst,_) -> isSublistOf (_fisItemSet f) lst) items
134 in (f, (map (fst . snd) docs, map (snd . snd) docs))) fis
135
136 wsum :: [Maybe Double] -> Int -> Maybe Double
137 wsum lst sup =
138 let w = fmap sum $ sequence lst
139 in
140 if (isNothing w)
141 then Just $ fromIntegral sup
142 else w
143
144 fisWithSizePolyMap' :: Ord a => Size -> Frequency -> [([a], (Maybe Double,[Int]))] -> Map (Set a) (Int, (Maybe Double,[Int]))
145 fisWithSizePolyMap' n f is = Map.fromList
146 $ map (\(fis,(ws,sources)) -> (Set.fromList (_fisItemSet fis),(_fisCount fis,(wsum ws (_fisCount fis),concat sources))))
147 $ reIndexFis is
148 $ fisWithSizePoly2 n f (map fst is)
149
150 ------------------------------------------------------------------------
151 ------------------------------------------------------------------------
152
153
154 --
155 ---- | /!\ indexes are not the same:
156 --
157 ---- | Index ngrams from Map
158 ----indexNgram :: Ord a => Map a Occ -> Map Index a
159 ----indexNgram m = fromList (zip [1..] (keys m))
160 --
161 ---- | Index ngrams from Map
162 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
163 ----ngramIndex m = fromList (zip (keys m) [1..])
164 --
165 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
166 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
167 --
168 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
169 --indexIt xs = (m, is)
170 -- where
171 -- m = sumOcc (map occ xs)
172 -- is = map (indexWith m) xs
173 --
174 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
175 --list2fis n xs = (m', fs)
176 -- where
177 -- (m, is) = indexIt xs
178 -- m' = M.filter (>50000) m
179 -- fs = FIS.all n is
180 --
181 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
182 --text2fis n xs = list2fis n (map terms xs)
183 --
184 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
185 ----text2fisWith = undefined
186 --
187