]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/FrequentItemSet.hs
[ngrams] add score update endpoint + sorting
[gargantext.git] / src / Gargantext / Core / Text / Metrics / FrequentItemSet.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.FrequentItemSet
3 Description : Ngrams tools
4 Copyright : (c) CNRS, 2018
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Domain Specific Language to manage Frequent Item Set (FIS)
11
12 -}
13
14
15 module Gargantext.Core.Text.Metrics.FrequentItemSet
16 ( Fis, Size(..)
17 , occ_hlcm, cooc_hlcm
18 , allFis, between
19 , fisWithSize
20 , fisWith
21 , fisWithSizePoly
22 , fisWithSizePoly2
23 , fisWithSizePolyMap
24 , module HLCM
25 )
26 where
27
28 import Data.List (concat, null)
29 import Data.Map.Strict (Map)
30 import Data.Maybe (catMaybes)
31 import Data.Set (Set)
32 import Gargantext.Prelude
33 import HLCM
34 import qualified Data.Map.Strict as Map
35 import qualified Data.Set as Set
36 import qualified Data.Vector as V
37
-- | Size constraint used to select frequent item sets.
-- See 'fisWithSize' for how each constructor is matched against a result.
data Size
  = Point Int        -- ^ a single size
  | Segment Int Int  -- ^ a range of sizes given by its two bounds
39
40 ------------------------------------------------------------------------
-- | Occurrences: the frequent item sets selected with @Point 1@,
-- i.e. those made of a single item.
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm freq docs = fisWithSize (Point 1) freq docs
44
-- | Co-occurrences: the frequent item sets selected with @Point 2@,
-- i.e. those made of exactly two items.
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm freq docs = fisWithSize (Point 2) freq docs
48
-- | Every frequent item set, with no size filter applied
-- ('fisWith' is called without a predicate).
allFis :: Frequency -> [[Item]] -> [Fis]
allFis freq docs = fisWith Nothing freq docs
51
52 ------------------------------------------------------------------------
-- | Frequent item sets selected by the 'Segment' built from the given pair
-- of bounds; the matching itself is done by 'fisWithSize'.
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (lo, hi) freq docs = fisWithSize (Segment lo hi) freq docs
55
56 --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
57 --maximum m = between (0,m)
58
59
60 ------------------------------------------------------------------------
61 ------------------------------------------------------------------------
-- | Data type for a Frequent Item Set: a count paired with a list of items.
-- TODO replace List with Set in '_fisItemSet'
-- (be careful: doing so risks erasing HLCM behavior).
type Fis = Fis' Item

data Fis' a = Fis'
  { _fisCount   :: Int  -- ^ count attached to the item set
  , _fisItemSet :: [a]  -- ^ members of the item set
  }
  deriving (Show)
69
-- | Maps over the items of the set; the count is left untouched.
instance Functor Fis' where
  fmap g fis = fis { _fisItemSet = fmap g (_fisItemSet fis) }
72
-- | Turn a raw row into a 'Fis': the head of the row is taken as the count
-- and the remaining elements as the items. An empty row gives 'Nothing'.
items2fis :: [Item] -> Maybe Fis
items2fis row = case row of
  count : members -> Just (Fis' count members)
  []              -> Nothing
77
78 ------------------------------------------------------------------------
79 ------------------------------------------------------------------------
80
-- | Frequent item sets restricted by the number of items they contain.
--
-- The predicate given to 'fisWith' runs on raw rows, before 'items2fis'
-- strips the leading count, so a set of @k@ items is a row of length @k + 1@.
-- Both constructors account for that extra element:
--
-- * @Point k@ keeps the sets made of exactly @k@ items;
-- * @Segment lo hi@ keeps the sets made of @lo@ to @hi@ items, inclusive.
--
-- Previously only @Point@ compensated for the leading count, which made
-- @Segment k k@ select different sets than @Point k@.
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = case n of
    Point n'    -> fisWith (Just (\x -> length x == n' + 1)) f is
    Segment a b -> fisWith (Just (\x -> cond (a + 1) (length x) (b + 1))) f is
  where
    -- Inclusive range check.
    cond lo x hi = lo <= x && x <= hi
87
88
-- | Run HLCM on the non-empty rows of the input and convert its output to
-- 'Fis' values, optionally keeping only the raw rows accepted by a predicate.
--
-- Empty rows are removed first; if nothing is left, the result is @[]@ and
-- 'runLCMmatrix' is not called at all.
--
-- TODO: filter on Fis and not on [Item].
fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
fisWith s f is = mine (filter (not . null) is)
  where
    mine []   = []
    -- Rows that 'items2fis' rejects (empty ones) are dropped by 'catMaybes'.
    mine rows = catMaybes (map items2fis (keep (runLCMmatrix rows f)))

    -- The optional predicate is applied to the raw HLCM rows.
    keep = case s of
      Just accept -> filter accept
      Nothing     -> identity
99
-- | 'fisWithSize' for any ordered item type.
--
-- The keys are taken as a 'Set' only to state that duplicates are not
-- wanted. Each key is numbered by its position in @Set.toList ks@; rows are
-- translated to those numbers, mined, and the results translated back.
--
-- NOTE(review): both lookups are partial. An item that is not a member of
-- @ks@ makes @Map.!@ fail, and translating back with @V.!@ assumes the mined
-- items are valid positions — confirm that callers always pass a complete
-- key set.
fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
fisWithSizePoly n f ks rows =
  map (fmap decode) (fisWithSize n f (map (map encode) rows))
  where
    keys     = Set.toList ks
    byIndex  = V.fromList keys
    toIndex  = Map.fromList (zip keys [0..])
    encode k = toIndex Map.! k
    decode i = byIndex V.! i
109
-- | Same as 'fisWithSizePoly', with the key set computed from the rows
-- themselves: every item appearing in any row becomes a key.
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 n f is = fisWithSizePoly n f (Set.fromList (concat is)) is
114
-- | Result of 'fisWithSizePoly2' as a map from each item set (as a 'Set')
-- to its count. If several results collapse to the same set, 'Map.fromList'
-- keeps the count of the last one.
fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
fisWithSizePolyMap n f is =
  Map.fromList
    [ (Set.fromList (_fisItemSet fis), _fisCount fis)
    | fis <- fisWithSizePoly2 n f is
    ]
118
119
120 ------------------------------------------------------------------------
121 ------------------------------------------------------------------------
122
123
124 --
125 ---- | /!\ indexes are not the same:
126 --
127 ---- | Index ngrams from Map
128 ----indexNgram :: Ord a => Map a Occ -> Map Index a
129 ----indexNgram m = fromList (zip [1..] (keys m))
130 --
131 ---- | Index ngrams from Map
132 ----ngramIndex :: Ord a => Map a Occ -> Map a Index
133 ----ngramIndex m = fromList (zip (keys m) [1..])
134 --
135 --indexWith :: Ord a => Map a Occ -> [a] -> [Int]
136 --indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
137 --
138 --indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
139 --indexIt xs = (m, is)
140 -- where
141 -- m = sumOcc (map occ xs)
142 -- is = map (indexWith m) xs
143 --
144 --list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
145 --list2fis n xs = (m', fs)
146 -- where
147 -- (m, is) = indexIt xs
148 -- m' = M.filter (>50000) m
149 -- fs = FIS.all n is
150 --
151 --text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
152 --text2fis n xs = list2fis n (map terms xs)
153 --
154 ----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
155 ----text2fisWith = undefined
156 --
157