]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics.hs
[FLOW][DB][NGRAMS][Lists] Done, need to be tested with real data.
[gargantext.git] / src / Gargantext / Text / Metrics.hs
1 {-|
2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Mainly reexport functions in @Data.Text.Metrics@
11
12 -}
13
14 {-# LANGUAGE BangPatterns #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Metrics
19 where
20
21 import Data.Ord (Down(..))
22 import qualified Data.List as L
23
24 import Data.Map (Map)
25 import qualified Data.Map as M
26
27 --import Math.KMeans (kmeans, euclidSq, elements)
28
29 import Gargantext.Prelude
30 import Gargantext.Viz.Graph.Distances.Matrice
31 import Gargantext.Viz.Graph.Index
32
33 import qualified Data.Array.Accelerate.Interpreter as DAA
34 import qualified Data.Array.Accelerate as DAA
35 -- import Data.Array.Accelerate ((:.)(..), Z(..))
36
37 import GHC.Real (round)
38
39 import Debug.Trace (trace)
40
41 data MapListSize = MapListSize Int
42 data InclusionSize = InclusionSize Int
43 data SampleBins = SampleBins Double
44 data Clusters = Clusters Int
45 data DefaultValue = DefaultValue Int
46
47 data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
48 , fc_inclusionSize :: InclusionSize
49 , fc_sampleBins :: SampleBins
50 , fc_clusters :: Clusters
51 , fc_defaultValue :: DefaultValue
52 }
53
54 filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
55 filterCooc fc cc = (filterCooc' fc) ts cc
56 where
57 ts = map _scored_terms $ takeSome fc $ coocScored cc
58
59 filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
60 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
61 foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
62 M.empty selection
63 where
64 selection = [(x,y) | x <- ts
65 , y <- ts
66 -- , x >= y
67 ]
68
69
70 -- | Map list creation
71 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
72 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
73 -- each parts is then ordered by Inclusion/Exclusion
74 -- take n scored terms in each parts where n * SampleBins = MapListSize.
75 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
76 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
77 $ takeSample n m
78 $ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
79 -- splitKmeans k scores
80 where
81 -- TODO: benchmark with accelerate-example kmeans version
82 --splitKmeans x xs = L.concat $ map elements
83 -- $ V.take (k-1)
84 -- $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
85 -- euclidSq x xs
86 n = round ((fromIntegral l)/s)
87 m = round $ (fromIntegral $ length scores) / (s)
88 takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
89 L.concat $ map (L.take n')
90 $ map (sortWith (Down . _scored_incExc))
91 -- TODO use kmeans s instead of splitEvery
92 -- in order to split in s heteregenous parts
93 -- without homogeneous order hypothesis
94 $ splitEvery m'
95 $ sortWith (Down . _scored_speGen) xs
96
97
98 data Scored ts = Scored { _scored_terms :: !ts
99 , _scored_incExc :: !InclusionExclusion
100 , _scored_speGen :: !SpecificityGenericity
101 } deriving (Show)
102
103 -- TODO in the textflow we end up needing these indices, it might be better
104 -- to compute them earlier and pass them around.
105 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
106 coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
107 where
108 (ti,fi) = createIndices m
109 (is, ss) = incExcSpeGen $ cooc2mat ti m
110 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)