2 Module : Gargantext.Text.Metrics
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Mainly reexport functions in @Data.Text.Metrics@
14 {-# LANGUAGE BangPatterns #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
18 module Gargantext.Text.Metrics
21 import Data.Ord (Down(..))
22 import qualified Data.List as L
25 import qualified Data.Map as M
27 --import Math.KMeans (kmeans, euclidSq, elements)
29 import Gargantext.Prelude
30 import Gargantext.Viz.Graph.Distances.Matrice
31 import Gargantext.Viz.Graph.Index
33 import qualified Data.Array.Accelerate.Interpreter as DAA
34 import qualified Data.Array.Accelerate as DAA
35 -- import Data.Array.Accelerate ((:.)(..), Z(..))
37 import GHC.Real (round)
41 data MapListSize = MapListSize Int
42 data InclusionSize = InclusionSize Int
43 data SampleBins = SampleBins Double
44 data Clusters = Clusters Int
45 data DefaultValue = DefaultValue Int
47 data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
48 , fc_inclusionSize :: InclusionSize
49 , fc_sampleBins :: SampleBins
50 , fc_clusters :: Clusters
51 , fc_defaultValue :: DefaultValue
54 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
55 filterCooc fc cc = (filterCooc' fc) ts cc
57 ts = map _scored_terms $ takeSome fc $ coocScored cc
59 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
60 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
61 foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
64 selection = [(x,y) | x <- ts
70 -- | Map list creation
71 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
72 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
73 -- each parts is then ordered by Inclusion/Exclusion
74 -- take n scored terms in each parts where n * SampleBins = MapListSize.
75 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
76 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
78 $ L.take l' $ sortWith (Down . _scored_incExc) scores
79 -- splitKmeans k scores
81 -- TODO: benchmark with accelerate-example kmeans version
82 --splitKmeans x xs = L.concat $ map elements
84 -- $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
86 n = round ((fromIntegral l)/s)
87 m = round $ (fromIntegral $ length scores) / (s)
88 takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
89 L.concat $ map (L.take n')
90 $ map (sortWith (Down . _scored_incExc))
91 -- TODO use kmeans s instead of splitEvery
92 -- in order to split in s heteregenous parts
93 -- without homogeneous order hypothesis
95 $ sortWith (Down . _scored_speGen) xs
98 data Scored t = Scored { _scored_terms :: !t
99 , _scored_incExc :: !InclusionExclusion
100 , _scored_speGen :: !SpecificityGenericity
103 -- TODO in the textflow we end up needing these indices, it might be better
104 -- to compute them earlier and pass them around.
105 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
106 coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
108 (ti,fi) = createIndices m
109 (is, ss) = incExcSpeGen $ cooc2mat ti m
110 scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)