src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE BangPatterns      #-}
  20 {-# LANGUAGE NoImplicitPrelude #-}
  21 {-# LANGUAGE OverloadedStrings #-}
  22
  23 module Gargantext.Text.Metrics
  24   where
  25
  26 import Data.Text (Text, pack)
  27 import Data.Map (Map)
  28 import qualified Data.List as L
  29 import qualified Data.Map  as M
  30 import qualified Data.Set  as S
  31 import qualified Data.Text as T
  32 import qualified Data.Vector as V
  33 import qualified Data.Vector.Unboxed as VU
  34 import Data.Tuple.Extra (both)
  35 --import GHC.Real (Ratio)
  36 --import qualified Data.Text.Metrics as DTM
  37 import Data.Array.Accelerate (toList)
  38 import Math.KMeans (kmeans, euclidSq, elements)
  39
  40
  41 import Gargantext.Prelude
  42
  43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  45 import Gargantext.Core (Lang(EN))
  46 import Gargantext.Core.Types (Terms(..))
  47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  48
  49 import Gargantext.Viz.Graph.Distances.Matrice
  50 import Gargantext.Viz.Graph.Index
  51
  52 import qualified Data.Array.Accelerate.Interpreter as DAA
  53 import qualified Data.Array.Accelerate as DAA
  54 -- import Data.Array.Accelerate ((:.)(..), Z(..))
  55
  56 import GHC.Real (round)
  57
  58 import Debug.Trace
  59 import Prelude (seq)
  60
  61 data MapListSize   = MapListSize   Int
  62 data InclusionSize = InclusionSize Int
  63 data SampleBins    = SampleBins    Double
  64 data Clusters      = Clusters      Int
  65 data DefaultValue  = DefaultValue  Int
  66
  67 data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
  68                                  , fc_inclusionSize :: InclusionSize
  69                                  , fc_sampleBins    :: SampleBins
  70                                  , fc_clusters      :: Clusters
  71                                  , fc_defaultValue  :: DefaultValue
  72                              }
  73
  74 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
  75 filterCooc fc cc = (filterCooc' fc) ts cc
  76   where
  77     ts     = map _scored_terms $ takeSome fc $ coocScored cc
  78
  79
  80 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
  81 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
  82   foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
  83     M.empty selection
  84   where
  85     selection  = [(x,y) | x <- ts
  86                         , y <- ts
  87                        -- , x >= y
  88                         ]
  89
  90
  91 -- | Map list creation
  92 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  93 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  94 -- each parts is then ordered by Inclusion/Exclusion
  95 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  96 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
  97 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
  98                     $ takeSample n m
  99                     $ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
 100                     -- $ splitKmeans k scores
 101   where
 102     -- TODO: benchmark with accelerate-example kmeans version
 103     splitKmeans x xs = L.concat $ map elements
 104                      $ V.take (k-1)
 105                      $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
 106                               euclidSq x xs
 107     n = round ((fromIntegral l)/s)
 108     m = round $ (fromIntegral $ length scores) / (s)
 109     takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
 110                         L.concat $ map (L.take n)
 111                                  $ map (reverse . (L.sortOn _scored_incExc))
 112                                  -- TODO use kmeans s instead of splitEvery
 113                                  -- in order to split in s heteregenous parts
 114                                  -- without homogeneous order hypothesis
 115                                  $ splitEvery m
 116                                  $ L.reverse $ L.sortOn _scored_speGen xs
 117
 118
 119 data Scored t = Scored { _scored_terms  :: !t
 120                        , _scored_incExc :: !InclusionExclusion
 121                        , _scored_speGen :: !SpecificityGenericity
 122                      } deriving (Show)
 123
 124 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 125 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 126   where
 127     (ti,fi) = createIndices m
 128     (is, ss) = incExcSpeGen $ cooc2mat ti m
 129     scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 151 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 152   where
 153     (ti,fi) = createIndices m
 154     ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
 155
 156
 157
 158 metrics_text :: Text
 159 metrics_text = T.intercalate " " metrics_sentences
 160
 161 metrics_sentences' :: [Text]
 162 metrics_sentences' = splitBy (Sentences 0) metrics_text
 163
 164 -- | Sentences
 165 metrics_sentences :: [Text]
 166 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 167                     , "I can see the glass on the table."
 168                     , "There was only a spoon on that table."
 169                     , "The glass just fall from the table, pouring wine everywhere."
 170                     , "I wish the glass did not contain wine."
 171                     ]
 172
 173 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 174
 175 -- | Terms reordered to visually check occurrences
 176 -- >>>
 177 {- [ [["table"],["glass"],["wine"],["spoon"]]
 178    , [["glass"],["table"]]
 179    , [["spoon"],["table"]]
 180    , [["glass"],["table"],["wine"]]
 181    , [["glass"],["wine"]]
 182    ]
 183 -}
 184
 185 metrics_terms :: IO [[Terms]]
 186 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
 187
 188 -- | Occurrences
 189 {-
 190 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 191          , (fromList ["object"],fromList [(["object"], 3 )])
 192          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 193          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 194 -}
 195 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 196
 197 {-
 198 -- fromList [((["glas"],["object"]),6)
 199             ,((["glas"],["spoon"]),4)
 200             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 201
 202 -}
 203 metrics_cooc = cooc <$> metrics_terms
 204
 205 metrics_cooc_mat = do
 206   m <- metrics_cooc
 207   let (ti,_) = createIndices m
 208   let mat_cooc = cooc2mat ti m
 209   pure ( ti
 210        , mat_cooc
 211        , incExcSpeGen_proba  mat_cooc
 212        , incExcSpeGen        mat_cooc
 213        )
 214
 215 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 216