2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Domain Specific Language to manage Frequent Item Set (FIS)
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
28 import Data.List (concat, null)
29 import Data.Map.Strict (Map)
30 import Data.Maybe (catMaybes)
32 import Gargantext.Prelude
34 import qualified Data.Map.Strict as Map
35 import qualified Data.Set as Set
36 import qualified Data.Vector as V
-- | Size constraint handed to 'fisWithSize'.
data Size
  = Point Int        -- ^ a single size
  | Segment Int Int  -- ^ a range given by its lower and upper bound
40 ------------------------------------------------------------------------
-- | Occurrences: the frequent item sets of size 1.
--
-- Forwards the frequency and the transactions to 'fisWithSize' with a
-- @Point 1@ size constraint.
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm minFreq transactions = fisWithSize singleton minFreq transactions
  where
    singleton = Point 1
-- | Cooccurrences: the frequent item sets of size 2.
--
-- Forwards the frequency and the transactions to 'fisWithSize' with a
-- @Point 2@ size constraint.
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm minFreq transactions = fisWithSize pair minFreq transactions
  where
    pair = Point 2
-- | Every frequent item set, without any size filter: 'fisWith' is called
-- with 'Nothing' as its optional predicate.
allFis :: Frequency -> [[Item]] -> [Fis]
allFis minFreq transactions = fisWith noFilter minFreq transactions
  where
    noFilter = Nothing
52 ------------------------------------------------------------------------
-- | Frequent item sets constrained by a pair of bounds; the pair is turned
-- into a 'Segment' and forwarded to 'fisWithSize'.
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (lo, hi) = fisWithSize range
  where
    range = Segment lo hi
56 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
57 --maximum m = between (0,m)
60 ------------------------------------------------------------------------
61 ------------------------------------------------------------------------
-- Record describing one frequent item set; its first field, '_fisCount',
-- is an 'Int' count.
-- NOTE(review): the rest of the record (remaining field(s), closing brace,
-- any deriving clause) is not visible in this excerpt. The two-argument
-- pattern @Fis' c is@ in the 'Functor' instance and the use of
-- @_fisItemSet@ elsewhere in the file suggest a second field holding the
-- items -- confirm against the full declaration.
62 -- | Data type to type the Frequent Item Set
63 -- TODO replace List with Set in fisItemSet
64 -- be careful : risks to erase HLCM behavior
66 data Fis' a = Fis' { _fisCount :: Int
-- | Mapping over a 'Fis'' transforms the items only; the count is carried
-- over unchanged.
instance Functor Fis' where
  fmap g fis = case fis of
    Fis' count items -> Fis' count (g <$> items)
-- | Build a 'Fis' from a raw row of items.
--
-- The head of the row becomes the count and the tail becomes the item set.
-- An empty row has no head to use as a count, so it yields 'Nothing'.
items2fis :: [Item] -> Maybe Fis
items2fis row = case row of
  []            -> Nothing
  count : items -> Just (Fis' count items)
78 ------------------------------------------------------------------------
79 ------------------------------------------------------------------------
-- | Frequent item sets restricted by a 'Size' constraint.
--
-- The predicate built here is handed to 'fisWith', which applies it to the
-- raw rows, i.e. before 'items2fis' splits off the head of each row as the
-- count. A raw row is therefore one element longer than its item set.
--
-- * @Point k@ keeps the rows holding exactly @k@ items.
-- * @Segment lo hi@ keeps the rows holding between @lo@ and @hi@ items,
--   both bounds inclusive.
--
-- Fix: the @Segment@ case used to compare the bounds against the raw row
-- length (count included) while @Point@ compensated with @+1@, so the two
-- constructors disagreed by one on what "size" means. Both now measure the
-- number of items, excluding the leading count.
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize size = fisWith (Just keep)
  where
    -- Number of items in a raw row, not counting the leading count.
    itemCount :: [Item] -> Int
    itemCount row = length row - 1

    keep :: [Item] -> Bool
    keep row =
      let k = itemCount row
      in case size of
           Point p       -> k == p
           Segment lo hi -> lo <= k && k <= hi
-- Core entry point: drops the empty transactions from `is`
-- (@filter (not . null)@), runs 'runLCMmatrix' on what is left with the
-- frequency `f`, passes the raw result rows through @filter'@, and converts
-- the surviving rows with 'items2fis', discarding the rows for which it
-- returns 'Nothing'.
--
-- `s` is an optional predicate over raw rows ('[Item]'); the visible
-- @Just fun -> filter fun@ alternative keeps only the rows satisfying it.
-- NOTE(review): some lines of this definition are not visible in this
-- excerpt (the other case alternatives and the head of @filter'@).
-- Presumably no non-empty transaction yields @[]@ and a 'Nothing' predicate
-- means "no filtering" -- confirm against the full source.
89 --- Filter on Fis and not on [Item]
90 fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
91 fisWith s f is = case filter (not . null) is of
93 js -> catMaybes $ map items2fis $ filter' $ runLCMmatrix js f
98 Just fun -> filter fun
-- Polymorphic wrapper around 'fisWithSize': every element of the input
-- transactions is converted with @toItem@, 'fisWithSize' is run with the
-- given size `n` and frequency `f`, and @fromItem@ is mapped over the items
-- of each resulting 'Fis''.
--
-- `ksv` is the key set `ks` laid out as a vector (in 'Set.toList' order) and
-- `ksm` maps each key to its position in that vector, counting from 0.
-- NOTE(review): the definitions of @toItem@ and @fromItem@ are not visible
-- in this excerpt; presumably they translate through `ksm` and `ksv`
-- respectively -- confirm, including what happens for a value absent
-- from `ks`.
100 -- Here the sole purpose to take the keys as a Set is tell we do not want
102 fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
103 fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem)
105 ksv = V.fromList $ Set.toList ks
106 ksm = Map.fromList . flip zip [0..] $ V.toList ksv
-- | Same as 'fisWithSizePoly', but the key set is derived from the input:
-- it is the set of all values occurring in any of the transactions.
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 size minFreq transactions =
  fisWithSizePoly size minFreq (Set.fromList (concat transactions)) transactions
-- | Frequent item sets as a map from item set to count.
--
-- Each result of 'fisWithSizePoly2' contributes one entry whose key is its
-- items collected into a 'Set' and whose value is its count. The entries go
-- through 'Map.fromList', so if two results share the same key the later
-- one wins.
fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
fisWithSizePolyMap size minFreq transactions =
  Map.fromList
    [ (Set.fromList (_fisItemSet fis), _fisCount fis)
    | fis <- fisWithSizePoly2 size minFreq transactions
    ]
120 ------------------------------------------------------------------------
121 ------------------------------------------------------------------------
125 ---- | /!\ indexes are not the same:
127 ---- | Index ngrams from Map
128 ----indexNgram :: Ord a => Map a Occ -> Map Index a
129 ----indexNgram m = fromList (zip [1..] (keys m))
131 ---- | Index ngrams from Map
132 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
133 ----ngramIndex m = fromList (zip (keys m) [1..])
135 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
136 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
138 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
139 --indexIt xs = (m, is)
141 -- m = sumOcc (map occ xs)
142 -- is = map (indexWith m) xs
144 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
145 --list2fis n xs = (m', fs)
147 -- (m, is) = indexIt xs
148 -- m' = M.filter (>50000) m
151 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
152 --text2fis n xs = list2fis n (map terms xs)
154 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
155 ----text2fisWith = undefined