src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE BangPatterns      #-}
  20 {-# LANGUAGE NoImplicitPrelude #-}
  21 {-# LANGUAGE OverloadedStrings #-}
  22
  23 module Gargantext.Text.Metrics
  24   where
  25
  26 import Data.Text (Text, pack)
  27 import Data.Map (Map)
  28 import qualified Data.List as L
  29 import qualified Data.Map  as M
  30 import qualified Data.Set  as S
  31 import qualified Data.Text as T
  32 import qualified Data.Vector as V
  33 import qualified Data.Vector.Unboxed as VU
  34 import Data.Tuple.Extra (both)
  35 --import GHC.Real (Ratio)
  36 --import qualified Data.Text.Metrics as DTM
  37 import Data.Array.Accelerate (toList)
  38 import Math.KMeans (kmeans, euclidSq, elements)
  39
  40
  41 import Gargantext.Prelude
  42
  43 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  44 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  45 import Gargantext.Core (Lang(EN))
  46 import Gargantext.Core.Types (Terms(..))
  47 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  48
  49 import Gargantext.Viz.Graph.Distances.Matrice
  50 import Gargantext.Viz.Graph.Index
  51
  52 import qualified Data.Array.Accelerate.Interpreter as DAA
  53 import qualified Data.Array.Accelerate as DAA
  54
  55 import GHC.Real (round)
  56
  57 filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
  58 filterCooc cc = filterCooc' ts cc
  59   where
  60     ts     = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
  61
  62 filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
  63 filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
  64   where
  65     errMessage = panic "Filter cooc: no key"
  66     selection  = [(x,y) | x <- ts, y <- ts, x > y]
  67
  68
  69 type MapListSize = Int
  70 type SampleBins  = Double
  71 type Clusters    = Int
  72
  73 -- | Map list creation
  74 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  75 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  76 -- each parts is then ordered by Inclusion/Exclusion
  77 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  78 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
  79 takeSome l s k scores = L.take l
  80                     $ takeSample n m
  81                     $ splitKmeans k scores
  82   where
  83     -- TODO: benchmark with accelerate-example kmeans version
  84     splitKmeans x xs = elements
  85                      $ V.head
  86                      $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
  87                               euclidSq x xs
  88     n = round ((fromIntegral l)/s)
  89     m = round $ (fromIntegral $ length scores) / (s)
  90     takeSample n m xs = L.concat $ map (L.take n)
  91                                  $ L.reverse $ map (L.sortOn _scored_incExc)
  92                                  -- TODO use kmeans s instead of splitEvery
  93                                  -- in order to split in s heteregenous parts
  94                                  -- without homogeneous order hypothesis
  95                                  $ splitEvery m
  96                                  $ L.reverse $ L.sortOn _scored_speGen xs
  97
  98
  99 data Scored t = Scored { _scored_terms  :: !t
 100                        , _scored_incExc :: !InclusionExclusion
 101                        , _scored_speGen :: !SpecificityGenericity
 102                      } deriving (Show)
 103
 104 --coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 105 --coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 106 coocScored :: (DAA.Elt t, Ord t) => Map (t,t) Int -> [Scored t]
 107 coocScored m = map (\(t,inc,spe) -> Scored t inc spe) scores
 108   where
 109     (ti,fi) = createIndices m
 110     (is, ss) = incExcSpeGen $ cooc2mat ti m
 111     scores = DAA.toList $ DAA.run $ DAA.zip3 (DAA.use ts) (DAA.use is) (DAA.use ss)
 112     ts = DAA.fromList (DAA.arrayShape is) (snd <$> M.toAscList fi)
 113     -- TODO fi should already be a Vector
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 135 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 136   where
 137     (ti,fi) = createIndices m
 138     ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
 139
 140
 141
 142 metrics_text :: Text
 143 metrics_text = T.intercalate " " metrics_sentences
 144
 145 metrics_sentences' :: [Text]
 146 metrics_sentences' = splitBy (Sentences 0) metrics_text
 147
 148 -- | Sentences
 149 metrics_sentences :: [Text]
 150 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 151                     , "I can see the glass on the table."
 152                     , "There was only a spoon on that table."
 153                     , "The glass just fall from the table, pouring wine everywhere."
 154                     , "I wish the glass did not contain wine."
 155                     ]
 156
 157 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 158
 159 -- | Terms reordered to visually check occurrences
 160 -- >>>
 161 {- [ [["table"],["glass"],["wine"],["spoon"]]
 162    , [["glass"],["table"]]
 163    , [["spoon"],["table"]]
 164    , [["glass"],["table"],["wine"]]
 165    , [["glass"],["wine"]]
 166    ]
 167 -}
 168
 169 metrics_terms :: IO [[Terms]]
 170 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
 171
 172 -- | Occurrences
 173 {-
 174 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 175          , (fromList ["object"],fromList [(["object"], 3 )])
 176          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 177          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 178 -}
 179 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 180
 181 {-
 182 -- fromList [((["glas"],["object"]),6)
 183             ,((["glas"],["spoon"]),4)
 184             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 185
 186 -}
 187 metrics_cooc = cooc <$> metrics_terms
 188
 189 metrics_cooc_mat = do
 190   m <- metrics_cooc
 191   let (ti,_) = createIndices m
 192   let mat_cooc = cooc2mat ti m
 193   pure ( ti
 194        , mat_cooc
 195        , incExcSpeGen_proba  mat_cooc
 196        , incExcSpeGen        mat_cooc
 197        )
 198
 199 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 200