From 4a4ade7d23921dd160aee499500d01e5371493e1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Tue, 29 May 2018 17:32:34 +0200 Subject: [PATCH 01/16] [FEAT] Cooc -> Matrix conversions tools. --- package.yaml | 1 + src/Gargantext/Pipeline.hs | 25 +++- src/Gargantext/Prelude.hs | 12 ++ src/Gargantext/Text/Metrics/Occurrences.hs | 59 ++++++--- src/Gargantext/Text/Terms.hs | 2 + src/Gargantext/Text/Terms/Lists.hs | 1 - src/Gargantext/Viz/Graph/Distances.hs | 25 ++++ src/Gargantext/Viz/Graph/Distances/Matrice.hs | 21 +++- src/Gargantext/Viz/Graph/Index.hs | 114 ++++++++++++++++++ src/Gargantext/Viz/Graph/Utils.hs | 95 ++------------- stack.yaml | 3 +- 11 files changed, 241 insertions(+), 117 deletions(-) create mode 100644 src/Gargantext/Viz/Graph/Distances.hs create mode 100644 src/Gargantext/Viz/Graph/Index.hs diff --git a/package.yaml b/package.yaml index e7884f03..2de9e6a0 100644 --- a/package.yaml +++ b/package.yaml @@ -35,6 +35,7 @@ library: dependencies: - QuickCheck - accelerate + - accelerate-io - aeson - aeson-lens - aeson-pretty diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index f5b52066..ba74e8ee 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -10,29 +10,44 @@ Portability : POSIX -} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} - -{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE NoImplicitPrelude #-} module Gargantext.Pipeline where +import Data.Text (unpack) +import qualified Data.Text as DT import Data.Text.IO (readFile) +---------------------------------------------- +---------------------------------------------- + import Gargantext.Core +import Gargantext.Core.Types import Gargantext.Prelude +import Gargantext.Viz.Graph.Index (map', createIndexes) +import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double) import Gargantext.Text.Metrics.Occurrences import Gargantext.Text.Terms import Gargantext.Text.Context +import Data.Array.Accelerate as A pipeline pth = do text <- readFile pth let contexts = splitBy Sentences 4 text - cooc <$> map occurrences <$> mapM (terms Mono FR) contexts - -- todo + myterms <- mapM (terms Multi FR) contexts + -- todo filter stop words + let myCooc = removeApax $ cooc myterms + --pure myCooc -- Cooc map -> Matrix - -- distributional or conditional + --pure $ createIndexes myCooc + pure $ map' int2double myCooc -- Matrix -> Graph + + + + diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs index 29083ec9..4cad2f18 100644 --- a/src/Gargantext/Prelude.hs +++ b/src/Gargantext/Prelude.hs @@ -49,6 +49,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer , otherwise, when , undefined , IO() + , compare ) -- TODO import functions optimized in Utils.Count @@ -106,6 +107,12 @@ movingAverage steps xs = map mean $ chunkAlong steps 1 xs ma :: [Double] -> [Double] ma = movingAverage 3 +-- | splitEvery n == chunkAlong n n +splitEvery :: Int -> [a] -> [[a]] +splitEvery _ [] = L.cycle [[]] +splitEvery n xs = + let (h,t) = L.splitAt n xs + in h : splitEvery n t -- | Function to split a range into chunks chunkAlong :: Int -> Int -> [a] -> [[a]] @@ -227,3 +234,8 @@ zipSnd f xs = zip xs (f xs) unMaybe :: [Maybe a] -> [a] unMaybe = map fromJust . L.filter isJust +-- maximumWith +maximumWith f = L.maximumBy (\x y -> compare (f x) (f y)) + + + diff --git a/src/Gargantext/Text/Metrics/Occurrences.hs b/src/Gargantext/Text/Metrics/Occurrences.hs index d4e24b33..00a0f429 100644 --- a/src/Gargantext/Text/Metrics/Occurrences.hs +++ b/src/Gargantext/Text/Metrics/Occurrences.hs @@ -29,12 +29,17 @@ module Gargantext.Text.Metrics.Occurrences where +import Control.Arrow ((***)) +import qualified Data.List as List import Data.Map.Strict (Map - , empty + , empty, singleton , insertWith, insertWithKey, unionWith - , toList + , toList, lookup, mapKeys ) import Data.Set (Set) +import qualified Data.Set as Set +import Data.Text (pack) + import qualified Data.Map.Strict as DMS import Control.Monad ((>>),(>>=)) import Data.String (String()) @@ -53,6 +58,7 @@ data Group = ByStem | ByOntology type Grouped = Stems +{- -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"] -- >> map occurrences <$> Prelude.mapM (terms Mono EN) -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]] @@ -66,32 +72,45 @@ type Grouped = Stems --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)] ---- -cooc :: (Ord b, Num a) => [Map b a] -> Map (b, b) a -cooc ts = cooc' $ map cooc'' ts + -} + +type Occs = Int +type Coocs = Int -cooc' :: (Ord b, Num a) => [Map (b, b) a] -> Map (b,b) a -cooc' = foldl' (\x y -> unionWith (+) x y) empty +removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int +removeApax = DMS.filter (> 1) -cooc'' :: (Ord b, Num a) => Map b a -> Map (b, b) a -cooc'' m = foldl' (\x (y,c) -> insertWith (+) y c x) empty xs +cooc :: [[Terms]] -> Map (Label, Label) Int +cooc tss = + mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss) where - xs =[ ((x'',y''), c') | x' <- toList m - , y' <- toList m - , let x'' = fst x' - , let y'' = fst y' - , x'' < y'' - , let c' = 1 - --, let c' = snd x' + snd y' - ] + terms_occs = occurrences (List.concat tss) + delta f = f *** f + + +labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label +labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of + Just label -> label + Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g) + +cooc' :: Ord b => [Set b] -> Map (b, b) Coocs +cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs + where + xs = [ ((x, y), 1) + | xs <- tss + , ys <- tss + , x <- Set.toList xs + , y <- Set.toList ys + , x < y + ] -- | Compute the grouped occurrences (occ) -occurrences :: [Terms] -> Map Grouped Int +occurrences :: [Terms] -> Map Grouped (Map Terms Int) occurrences = occurrences' _terms_stem -occurrences' :: Ord b => (a -> b) -> [a] -> Occ b -occurrences' f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs - +occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int) +occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty -- TODO add groups and filter stops sumOcc :: Ord a => [Occ a] -> Occ a diff --git a/src/Gargantext/Text/Terms.hs b/src/Gargantext/Text/Terms.hs index 68dbb067..f19496f2 100644 --- a/src/Gargantext/Text/Terms.hs +++ b/src/Gargantext/Text/Terms.hs @@ -43,6 +43,8 @@ import Gargantext.Text.Terms.Mono (monoterms') data TermType = Mono | Multi +-- remove Stop Words +-- map (filter (\t -> not . elem t)) $ ------------------------------------------------------------------------ terms :: TermType -> Lang -> Text -> IO [Terms] terms Mono lang txt = pure $ monoterms' lang txt diff --git a/src/Gargantext/Text/Terms/Lists.hs b/src/Gargantext/Text/Terms/Lists.hs index bc7c1e4d..9b4f10ce 100644 --- a/src/Gargantext/Text/Terms/Lists.hs +++ b/src/Gargantext/Text/Terms/Lists.hs @@ -35,4 +35,3 @@ data ListName = Stop | Candidate | Graph --stop :: [Ngrams] -> [Ngrams] --stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs - diff --git a/src/Gargantext/Viz/Graph/Distances.hs b/src/Gargantext/Viz/Graph/Distances.hs new file mode 100644 index 00000000..b10d356e --- /dev/null +++ b/src/Gargantext/Viz/Graph/Distances.hs @@ -0,0 +1,25 @@ +{-| +Module : Gargantext.Graph.Distances +Description : Distance management tools +Copyright : (c) CNRS, 2017-Present +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +-} + +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE Strict #-} + +module Gargantext.Viz.Graph.Distances + where + + + + + + + + diff --git a/src/Gargantext/Viz/Graph/Distances/Matrice.hs b/src/Gargantext/Viz/Graph/Distances/Matrice.hs index f0d03eeb..546dedc8 100644 --- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs +++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs @@ -48,6 +48,13 @@ import Data.Maybe (Maybe(Just)) import qualified Gargantext.Prelude as P import qualified Data.Array.Accelerate.Array.Representation as Repr +import Gargantext.Text.Metrics.Occurrences + + +----------------------------------------------------------------------- +-- Test perf. +distriTest = distributional $ myMat 100 +----------------------------------------------------------------------- vector :: Int -> (Array (Z :. Int) Int) vector n = fromList (Z :. n) [0..n] @@ -55,14 +62,14 @@ vector n = fromList (Z :. n) [0..n] matrix :: Elt c => Int -> [c] -> Matrix c matrix n l = fromList (Z :. n :. n) l -myMat :: Int -> Matrix Double +myMat :: Int -> Matrix Int myMat n = matrix n [1..] -- | Two ways to get the rank (as documentation) -rank :: (Matrix Double) -> Int +rank :: (Matrix a) -> Int rank m = arrayRank $ arrayShape m -rank' :: (Matrix Double) -> Int +rank' :: (Matrix a) -> Int rank' m = n where Z :. _ :. n = arrayShape m @@ -109,8 +116,8 @@ conditional m = (run $ ie (use m), run $ sg (use m)) -- | Distributional Distance -distributional :: Matrix Double -> Matrix Double -distributional m = run $ filter $ ri (use m) +distributional :: Matrix Int -> Matrix Double +distributional m = run $ filter $ ri (map fromIntegral $ use m) where n = rank' m @@ -131,3 +138,7 @@ distributional m = run $ filter $ ri (use m) crossProduct m = zipWith (*) (cross m ) (cross (transpose m)) cross mat = zipWith (-) (mkSum n mat) (mat) + +int2double :: Matrix Int -> Matrix Double +int2double m = run (map fromIntegral $ use m) + diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs new file mode 100644 index 00000000..5fe0d3c7 --- /dev/null +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -0,0 +1,114 @@ +{-| +Module : Gargantext.Graph.Distances.Utils +Description : Tools to compute distances from Cooccurrences +Copyright : (c) CNRS, 2017-Present +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +Basically @compute@ takes an accelerate function as first input, a Map +of coccurrences as second input and outputs a Map automatically using +indexes. + +TODO: +--cooc2fgl :: Ord t, Integral n => Map (t, t) n -> Graph +--fgl2json + +-} + +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE TypeOperators #-} + + +module Gargantext.Viz.Graph.Index + where + +import qualified Data.Array.Accelerate as A +import qualified Data.Array.Accelerate.IO.Data.Vector.Unboxed as AU + +import qualified Data.Vector.Unboxed as DVU +import Data.List (concat) + +import Data.Set (Set) +import qualified Data.Set as S + +import Data.Map (Map) +import qualified Data.Map.Strict as M + +import Gargantext.Prelude + +type Index = Int + + +------------------------------------------------------------------------------- +{- +map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double) + -> Map (t, t) Int + -> Map (t, t) Double +map'' f m = back . f' . from m + where + from (fs, m') = unzip $ M.toAscList m + f' = f $ A.fromList shape m' + shape = (A.Z A.:. n A.:. n) + back = M.fromAscList . zip fs . A.toList +-} +------------------------------------------------------------------------------- +map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double) + -> Map (t, t) Int + -> Map (t, t) Double +map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m + where + (toI, fromI) = createIndexes m + +map'' m = cooc2mat toI m + where + (toI, fromI) = createIndexes m + +------------------------------------------------------------------------------- +------------------------------------------------------------------------------- +cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> A.Matrix Int +cooc2mat ti m = A.fromFunction shape (\(A.Z A.:. x A.:. y) -> lookup' x y) + where + shape = (A.Z A.:. n A.:. n) + n = M.size ti + lookup' x y = maybe 0 identity (M.lookup (x,y) (toIndex ti m)) + +mat2cooc :: A.Matrix Double -> Map (Index, Index) Double +mat2cooc m = M.fromList $ concat -- [((Int,Int), Double)] + $ map (\(x,xs) -> map (\(y,ys) -> ((x,y),ys)) xs) -- [[((Int,Int), Double)]] + $ zip ([1..] :: [Int]) -- [(Int, [(Int, Double)]] + $ map (zip ([1..] :: [Int])) -- [[(Int, Double)]] + $ splitEvery n (A.toList m) -- [[Double]] + where + A.Z A.:. _ A.:. n = A.arrayShape m + +------------------------------------------------------------------------------- +------------------------------------------------------------------------------- +toIndex :: Ord t => Map t Index -> Map (t,t) a -> Map (Index,Index) a +toIndex ni ns = indexConversion ni ns + +fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a +fromIndex ni ns = indexConversion ni ns +--------------------------------------------------------------------------------- +indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a +indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms) +------------------------------------------------------------------------------- +------------------------------------------------------------------------------- +createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t) +createIndexes = set2indexes . cooc2set + where + cooc2set :: Ord t => Map (t, t) a -> Set t + cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs') + where + insert as s = foldl' (\s' t -> S.insert t s') s as + + set2indexes :: Ord t => Set t -> (Map t Index, Map Index t) + set2indexes s = (M.fromList toIndex', M.fromList fromIndex') + where + fromIndex' = zip [1..] (S.toList s) + toIndex' = zip (S.toList s) [1..] + + diff --git a/src/Gargantext/Viz/Graph/Utils.hs b/src/Gargantext/Viz/Graph/Utils.hs index 04af0f5c..f91a4b62 100644 --- a/src/Gargantext/Viz/Graph/Utils.hs +++ b/src/Gargantext/Viz/Graph/Utils.hs @@ -7,6 +7,8 @@ Maintainer : team@gargantext.org Stability : experimental Portability : POSIX +These functions are used for Vector.Matrix only. + -} {-# LANGUAGE BangPatterns #-} @@ -34,65 +36,16 @@ import Gargantext.Prelude ------------------------------------------------------------------------ -- | Some utils to build the matrix from cooccurrence results -type Distance = Double -type Cooc = Int -type NgramId = Int -type Index = Int - --- Type Families ---type Matrix' Index a ---type Matrix' NgramId a - -data Matrice a = Matrice { matrice_fromIndex :: !(Map Index NgramId) - , matrice_toIndex :: !(Map NgramId Index) - , matrice :: !(Matrix a) - } deriving (Show) - ---fromMatrice :: Matrice Double -> [(NgramId, NgramId, Double)] ---fromMatrice m = undefined - - -toMatrice :: [(NgramId, NgramId, Int)] -> Matrice Double -toMatrice ns = Matrice fromIndx toIndx m - where - s = cooc2set ns - (fromIndx, toIndx) = set2indexes s - n = (length (S.toList s)) - idx = toIndex toIndx ns - m = matrix n n (\x -> maybe 0 identity (fromIntegral <$> M.lookup x idx)) - -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -toIndex :: Map NgramId Index -> [(NgramId, NgramId, a)] -> Map (Index,Index) a -toIndex ni ns = to ni ns - -fromIndex :: Map Index NgramId -> [(Index, Index, a)] -> Map (NgramId,NgramId) a -fromIndex ni ns = to ni ns -------------------------------------------------------------------------------- -to :: (Ord b, Ord k) => Map k b -> [(k, k, a)] -> Map (b, b) a -to index ns = M.fromList $ map (\(a1,a2,c) -> ( ( (M.!) index a1 - , (M.!) index a2 - ) - , c - ) - ) ns - -------------------------------------------------------------------------------- -cooc2set :: [(NgramId, NgramId, a)] -> Set NgramId -cooc2set cs' = foldl' (\s (a1,a2,_) -> insert [a1,a2] s ) S.empty cs' - where - insert as s = foldl' (\s' a -> S.insert a s') s as - - -set2indexes :: Set NgramId -> (Map Index NgramId, Map NgramId Index) -set2indexes s = (M.fromList fromIndex', M.fromList toIndex') - where - s' = S.toList s - fromIndex' = zip [1..] s' - toIndex' = zip s' [1..] - - +-- | For tests only, to be removed +-- m1 :: Matrix Double +-- m1 = fromList 300 300 [1..] +------------------------------------------------------------------------ ------------------------------------------------------------------------ +data Axis = Col | Row +------------------------------------------------------------------------ +-- | Matrix functions +type AxisId = Int + -- Data.Vector.Additions dropAt :: Int -> Vector a -> Vector a dropAt n v = debut <> (V.tail fin) @@ -100,28 +53,6 @@ dropAt n v = debut <> (V.tail fin) debut = V.take n v fin = V.drop n v ------------------------------------------------------------------------- -data Axis = Col | Row ----- | Matrix Algebra ---data Algebra a = Point a | Vector a | Matrix a --- ---multiply :: Algebra a -> Matrix a -> Matrix a ---multiply (Point a) = undefined ---multiply (Vector a) = undefined ---multiply (Matrix a) = undefined --- ---div :: Fractional a => Matrix a -> Matrix a ---div m = foldl' (\m c -> divCol c m) m [1.. (ncols m)] --- where --- divCol c m = mapCol (\_ x -> 1/x) c m --- ---divide :: Fractional a => Matrix a -> Matrix a -> Matrix a ---divide a b = a `multStd` (div b) - ------------------------------------------------------------------------- --- | Matrix functions -type AxisId = Int - total :: Num a => Matrix a -> a total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m)) @@ -141,7 +72,3 @@ toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs --- | For tests only, to be removed -m1 :: Matrix Double -m1 = fromList 300 300 [1..] - diff --git a/stack.yaml b/stack.yaml index bc83197c..23363269 100644 --- a/stack.yaml +++ b/stack.yaml @@ -13,9 +13,8 @@ extra-deps: commit: 6f0595d2421005837d59151a8b26eee83ebb67b5 - git: https://github.com/delanoe/servant-static-th.git commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434 - #- git: https://github.com/delanoe/accelerate.git - #commit: 007fd483a4410441fb5dd1b689a5f7dab66d27ad - accelerate-1.2.0.0 +- accelerate-io-1.2.0.0 - aeson-1.2.4.0 - aeson-lens-0.5.0.0 - duckling-0.1.3.0 -- 2.47.0 From 8d237e3cc3b4fa9718af65f45cbb063aa1301bdf Mon Sep 17 00:00:00 2001 From: Nicolas Pouillard Date: Wed, 30 May 2018 12:17:47 +0200 Subject: [PATCH 02/16] Fix mat2cooc --- src/Gargantext/Viz/Graph/Index.hs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs index 5fe0d3c7..9a378a20 100644 --- a/src/Gargantext/Viz/Graph/Index.hs +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -27,7 +27,8 @@ module Gargantext.Viz.Graph.Index where import qualified Data.Array.Accelerate as A -import qualified Data.Array.Accelerate.IO.Data.Vector.Unboxed as AU +import qualified Data.Array.Accelerate.Interpreter as A +import Data.Array.Accelerate (Matrix, Elt, Shape, (:.)(..), Z(..)) import qualified Data.Vector.Unboxed as DVU import Data.List (concat) @@ -69,21 +70,20 @@ map'' m = cooc2mat toI m ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- -cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> A.Matrix Int -cooc2mat ti m = A.fromFunction shape (\(A.Z A.:. x A.:. y) -> lookup' x y) +cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> Matrix Int +cooc2mat ti m = A.fromFunction shape (\(Z :. x :. y) -> lookup' x y) where shape = (A.Z A.:. n A.:. n) n = M.size ti lookup' x y = maybe 0 identity (M.lookup (x,y) (toIndex ti m)) -mat2cooc :: A.Matrix Double -> Map (Index, Index) Double -mat2cooc m = M.fromList $ concat -- [((Int,Int), Double)] - $ map (\(x,xs) -> map (\(y,ys) -> ((x,y),ys)) xs) -- [[((Int,Int), Double)]] - $ zip ([1..] :: [Int]) -- [(Int, [(Int, Double)]] - $ map (zip ([1..] :: [Int])) -- [[(Int, Double)]] - $ splitEvery n (A.toList m) -- [[Double]] +-- TODO rename mat2map +mat2cooc :: (Elt a, Shape (Z :. Index)) => + A.Array (Z :. Index :. Index) a -> Map (Index, Index) a +mat2cooc m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m where - A.Z A.:. _ A.:. n = A.arrayShape m + Z :. _ :. n = A.arrayShape m + f ((Z :. i :. j), x) = ((i, j), x) ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- -- 2.47.0 From 30757cfc6b1b8587a28d244b04ad87176cdc21b8 Mon Sep 17 00:00:00 2001 From: Nicolas Pouillard Date: Wed, 30 May 2018 14:33:25 +0200 Subject: [PATCH 03/16] Add map2mat --- src/Gargantext/Pipeline.hs | 4 ---- src/Gargantext/Viz/Graph/Index.hs | 10 ++++++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index ba74e8ee..58004b41 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -47,7 +47,3 @@ pipeline pth = do pure $ map' int2double myCooc -- Matrix -> Graph - - - - diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs index 9a378a20..550a927a 100644 --- a/src/Gargantext/Viz/Graph/Index.hs +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -31,7 +31,7 @@ import qualified Data.Array.Accelerate.Interpreter as A import Data.Array.Accelerate (Matrix, Elt, Shape, (:.)(..), Z(..)) import qualified Data.Vector.Unboxed as DVU -import Data.List (concat) +import Data.Maybe (fromMaybe) import Data.Set (Set) import qualified Data.Set as S @@ -75,7 +75,13 @@ cooc2mat ti m = A.fromFunction shape (\(Z :. x :. y) -> lookup' x y) where shape = (A.Z A.:. n A.:. n) n = M.size ti - lookup' x y = maybe 0 identity (M.lookup (x,y) (toIndex ti m)) + idx = toIndex ti m -- it is important to make sure that toIndex is ran only once. + lookup' x y = fromMaybe 0 $ M.lookup (x,y) idx + +map2mat :: Elt a => a -> Int -> Map (Index, Index) a -> Matrix a +map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.lookup (x, y) m) + where + shape = (Z :. n :. n) -- TODO rename mat2map mat2cooc :: (Elt a, Shape (Z :. Index)) => -- 2.47.0 From 9a6c89465488d8f0889da106dba79b6dcdf36a98 Mon Sep 17 00:00:00 2001 From: Nicolas Pouillard Date: Wed, 30 May 2018 14:35:30 +0200 Subject: [PATCH 04/16] New cooc2mat --- src/Gargantext/Viz/Graph/Index.hs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs index 550a927a..806157cf 100644 --- a/src/Gargantext/Viz/Graph/Index.hs +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -71,12 +71,10 @@ map'' m = cooc2mat toI m ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> Matrix Int -cooc2mat ti m = A.fromFunction shape (\(Z :. x :. y) -> lookup' x y) +cooc2mat ti m = map2mat 0 n idx where - shape = (A.Z A.:. n A.:. n) n = M.size ti idx = toIndex ti m -- it is important to make sure that toIndex is ran only once. - lookup' x y = fromMaybe 0 $ M.lookup (x,y) idx map2mat :: Elt a => a -> Int -> Map (Index, Index) a -> Matrix a map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.lookup (x, y) m) -- 2.47.0 From 4b81f9d11b3fda2832fb469ed81b2efa4b7eecae Mon Sep 17 00:00:00 2001 From: Nicolas Pouillard Date: Wed, 30 May 2018 14:56:41 +0200 Subject: [PATCH 05/16] small change in createIndexes --- src/Gargantext/Viz/Graph/Index.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs index 806157cf..1721af04 100644 --- a/src/Gargantext/Viz/Graph/Index.hs +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -112,7 +112,8 @@ createIndexes = set2indexes . cooc2set set2indexes :: Ord t => Set t -> (Map t Index, Map Index t) set2indexes s = (M.fromList toIndex', M.fromList fromIndex') where - fromIndex' = zip [1..] (S.toList s) - toIndex' = zip (S.toList s) [1..] + fromIndex' = zip [0..] xs + toIndex' = zip xs [0..] + xs = S.toList s -- 2.47.0 From 0be01d72c7eca85a8b77c6c447a06d84d7732c74 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Thu, 31 May 2018 01:04:07 +0200 Subject: [PATCH 06/16] [Pipeline] ok until clustering. --- src/Gargantext/Pipeline.hs | 30 +++++++---------- src/Gargantext/Text/Context.hs | 32 ++++++++++++------- src/Gargantext/Text/Parsers/CSV.hs | 12 +++---- src/Gargantext/Text/Terms.hs | 4 +++ src/Gargantext/Viz/Graph/Distances/Matrice.hs | 24 +++++++------- src/Gargantext/Viz/Graph/Index.hs | 25 +++------------ 6 files changed, 58 insertions(+), 69 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index 58004b41..fc341f3a 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -15,35 +15,29 @@ Portability : POSIX module Gargantext.Pipeline where -import Data.Text (unpack) -import qualified Data.Text as DT - import Data.Text.IO (readFile) ---------------------------------------------- ---------------------------------------------- - import Gargantext.Core -import Gargantext.Core.Types import Gargantext.Prelude -import Gargantext.Viz.Graph.Index (map', createIndexes) -import Gargantext.Viz.Graph.Distances.Matrice (distributional, int2double) +import Gargantext.Viz.Graph.Index (score) +import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Text.Metrics.Occurrences import Gargantext.Text.Terms import Gargantext.Text.Context -import Data.Array.Accelerate as A -pipeline pth = do - text <- readFile pth - let contexts = splitBy Sentences 4 text - myterms <- mapM (terms Multi FR) contexts - -- todo filter stop words +pipeline path = do + -- Text <- IO Text <- FilePath + text <- readFile path + let contexts = splitBy (Sentences 3) text + myterms <- extractTerms Multi FR contexts + -- TODO filter (\t -> not . elem t stopList) myterms + -- TODO groupBy (Stem | GroupList) let myCooc = removeApax $ cooc myterms - --pure myCooc - -- Cooc map -> Matrix - --pure $ createIndexes myCooc - pure $ map' int2double myCooc - -- Matrix -> Graph + -- Cooc -> Matrix + pure $ score distributional myCooc + -- Matrix -> Clustering -> Graph -> JSON diff --git a/src/Gargantext/Text/Context.hs b/src/Gargantext/Text/Context.hs index 6935f24d..bd63baa5 100644 --- a/src/Gargantext/Text/Context.hs +++ b/src/Gargantext/Text/Context.hs @@ -14,26 +14,34 @@ Context of text management tool {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} -module Gargantext.Text.Context where +module Gargantext.Text.Context + where import Data.Text (Text, pack, unpack, length) import Data.String (IsString) -import Text.HTML.TagSoup +import Text.HTML.TagSoup (parseTags, isTagText, Tag(..)) import Gargantext.Text import Gargantext.Prelude hiding (length) -data SplitBy = Paragraph | Sentences | Chars - -splitBy :: SplitBy -> Int -> Text -> [Text] -splitBy Chars n = map pack . chunkAlong n n . unpack -splitBy Sentences n = map unsentences . chunkAlong n n . sentences -splitBy Paragraph _ = map removeTag . filter isTagText . parseTags +data SplitContext = Chars Int | Sentences Int | Paragraphs Int + +tag = parseTags +-- | splitBy contexts of Chars or Sentences or Paragraphs +-- >> splitBy (Chars 0) "abcde" +-- ["a","b","c","d","e"] +-- >> splitBy (Chars 1) "abcde" +-- ["ab","bc","cd","de"] +-- >> splitBy (Chars 2) "abcde" +-- ["abc","bcd","cde"] +splitBy :: SplitContext -> Text -> [Text] +splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack +splitBy (Sentences n) = map unsentences . chunkAlong (n+1) 1 . sentences +splitBy (Paragraphs _) = map unTag . filter isTagText . tag where - removeTag :: IsString p => Tag p -> p - removeTag (TagText x) = x - removeTag (TagComment x) = x - removeTag _ = "" + unTag :: IsString p => Tag p -> p + unTag (TagText x) = x + unTag _ = "" diff --git a/src/Gargantext/Text/Parsers/CSV.hs b/src/Gargantext/Text/Parsers/CSV.hs index d3d0c39f..0ae33a29 100644 --- a/src/Gargantext/Text/Parsers/CSV.hs +++ b/src/Gargantext/Text/Parsers/CSV.hs @@ -50,13 +50,13 @@ data Doc = Doc deriving (Show) --------------------------------------------------------------- toDocs :: Vector CsvDoc -> [Doc] -toDocs v = V.toList +toDocs v = V.toList $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth) -> Doc nId t s py pm pd abst auth ) (V.enumFromN 1 (V.length v'')) v'' where v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps - seps= (V.fromList [Paragraph, Sentences, Chars]) + seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3]) --------------------------------------------------------------- fromDocs :: Vector Doc -> Vector CsvDoc @@ -69,7 +69,7 @@ fromDocs docs = V.map fromDocs' docs -- TODO adapt the size of the paragraph according to the corpus average -splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc +splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc splitDoc m splt doc = let docSize = (length $ c_abstract doc) in if docSize > 1000 then @@ -82,15 +82,15 @@ splitDoc m splt doc = let docSize = (length $ c_abstract doc) in V.fromList [doc] -splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc -splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs +splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc +splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs where firstDoc = CsvDoc t s py pm pd firstAbstract auth firstAbstract = head' abstracts nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts) - abstracts = (splitBy splt 20) abst + abstracts = (splitBy $ contextSize) abst head' x = maybe "" identity (head x) tail' x = maybe [""] identity (tailMay x) diff --git a/src/Gargantext/Text/Terms.hs b/src/Gargantext/Text/Terms.hs index f19496f2..112de26d 100644 --- a/src/Gargantext/Text/Terms.hs +++ b/src/Gargantext/Text/Terms.hs @@ -34,6 +34,7 @@ module Gargantext.Text.Terms where import Data.Text (Text) +import Data.Traversable import Gargantext.Prelude import Gargantext.Core @@ -46,6 +47,9 @@ data TermType = Mono | Multi -- remove Stop Words -- map (filter (\t -> not . elem t)) $ ------------------------------------------------------------------------ +extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms]) +extractTerms termType lang = mapM (terms termType lang) +------------------------------------------------------------------------ terms :: TermType -> Lang -> Text -> IO [Terms] terms Mono lang txt = pure $ monoterms' lang txt terms Multi lang txt = multiterms lang txt diff --git a/src/Gargantext/Viz/Graph/Distances/Matrice.hs b/src/Gargantext/Viz/Graph/Distances/Matrice.hs index 546dedc8..f5b26298 100644 --- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs +++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs @@ -36,10 +36,8 @@ Implementation use Accelerate library : module Gargantext.Viz.Graph.Distances.Matrice where ---import Data.Array.Accelerate.Data.Bits -import Data.Array.Accelerate.Interpreter (run) - import Data.Array.Accelerate +import Data.Array.Accelerate.Interpreter (run) import Data.Array.Accelerate.Smart import Data.Array.Accelerate.Type import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z) @@ -94,14 +92,7 @@ type SpecificityGenericity = Double conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) conditional m = (run $ ie (use m), run $ sg (use m)) where - r :: Rank - r = rank' m - xs :: Matrix' Double -> Matrix' Double - xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat) - ys :: Acc (Matrix Double) -> Acc (Matrix Double) - ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat) - ie :: Matrix' Double -> Matrix' Double ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat) sg :: Acc (Matrix Double) -> Acc (Matrix Double) @@ -109,7 +100,14 @@ conditional m = (run $ ie (use m), run $ sg (use m)) n :: Exp Double n = P.fromIntegral r - + + r :: Rank + r = rank' m + + xs :: Matrix' Double -> Matrix' Double + xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat) + ys :: Acc (Matrix Double) -> Acc (Matrix Double) + ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat) -- filter with threshold ----------------------------------------------------------------------- @@ -121,7 +119,9 @@ distributional m = run $ filter $ ri (map fromIntegral $ use m) where n = rank' m - miniMax m = map (\x -> ifThenElse (x > (the $ minimum $ maximum m)) x 0) m + miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m + where + miniMax' = (the $ minimum $ maximum m) filter m = zipWith (\a b -> max a b) m (transpose m) diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs index 1721af04..3ccf4a46 100644 --- a/src/Gargantext/Viz/Graph/Index.hs +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -43,28 +43,12 @@ import Gargantext.Prelude type Index = Int - ------------------------------------------------------------------------------- -{- -map'' :: (Ord t) => (A.Matrix Int -> A.Matrix Double) - -> Map (t, t) Int - -> Map (t, t) Double -map'' f m = back . f' . from m - where - from (fs, m') = unzip $ M.toAscList m - f' = f $ A.fromList shape m' - shape = (A.Z A.:. n A.:. n) - back = M.fromAscList . zip fs . A.toList --} ------------------------------------------------------------------------------- -map' :: (Ord t) => (A.Matrix Int -> A.Matrix Double) +score :: (Ord t) => (A.Matrix Int -> A.Matrix Double) -> Map (t, t) Int -> Map (t, t) Double -map' f m = fromIndex fromI . mat2cooc . f $ cooc2mat toI m - where - (toI, fromI) = createIndexes m - -map'' m = cooc2mat toI m +score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m where (toI, fromI) = createIndexes m @@ -81,10 +65,9 @@ map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.look where shape = (Z :. n :. n) --- TODO rename mat2map -mat2cooc :: (Elt a, Shape (Z :. Index)) => +mat2map :: (Elt a, Shape (Z :. Index)) => A.Array (Z :. Index :. Index) a -> Map (Index, Index) a -mat2cooc m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m +mat2map m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m where Z :. _ :. n = A.arrayShape m f ((Z :. i :. j), x) = ((i, j), x) -- 2.47.0 From b76fc48993fdf631b3725468745ac8baccc674ee Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Thu, 31 May 2018 07:27:00 +0200 Subject: [PATCH 07/16] [CLEAN] imports for pipeline. --- src/Gargantext/Pipeline.hs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index fc341f3a..ba3f4230 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -16,17 +16,15 @@ module Gargantext.Pipeline where import Data.Text.IO (readFile) - ----------------------------------------------- ---------------------------------------------- -import Gargantext.Core +import Gargantext.Core (Lang(FR)) import Gargantext.Prelude import Gargantext.Viz.Graph.Index (score) import Gargantext.Viz.Graph.Distances.Matrice (distributional) -import Gargantext.Text.Metrics.Occurrences -import Gargantext.Text.Terms -import Gargantext.Text.Context +import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) +import Gargantext.Text.Terms (TermType(Multi), extractTerms) +import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) pipeline path = do -- 2.47.0 From 47ed713fc5344a2a8aed996e5bdf4320b2d40096 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Thu, 31 May 2018 10:23:15 +0200 Subject: [PATCH 08/16] [PIPELINE] adding clustering louvain. --- src/Gargantext/Pipeline.hs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index ba3f4230..b19738f9 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -16,26 +16,35 @@ module Gargantext.Pipeline where import Data.Text.IO (readFile) +import qualified Data.Map.Strict as M ---------------------------------------------- import Gargantext.Core (Lang(FR)) import Gargantext.Prelude -import Gargantext.Viz.Graph.Index (score) +import Gargantext.Viz.Graph.Index (score, createIndexes, toIndex) import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) -import Gargantext.Text.Terms (TermType(Multi), extractTerms) +import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) +import Data.Graph.Clustering.Louvain (bestpartition) +import Data.Graph.Clustering.Louvain.Utils (map2graph) pipeline path = do -- Text <- IO Text <- FilePath text <- readFile path let contexts = splitBy (Sentences 3) text myterms <- extractTerms Multi FR contexts + -- TODO filter (\t -> not . elem t stopList) myterms -- TODO groupBy (Stem | GroupList) + let myCooc = removeApax $ cooc myterms + -- Cooc -> Matrix - pure $ score distributional myCooc + let theScores = M.filter (/=0) $ score distributional myCooc + let (ti, _) = createIndexes theScores + -- Matrix -> Clustering -> Graph -> JSON + pure $ bestpartition False $ map2graph $ toIndex ti theScores -- 2.47.0 From a5bcf8db2c1ef4b702b09af7528dbb26af737182 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Thu, 31 May 2018 10:43:57 +0200 Subject: [PATCH 09/16] [RENAME] name and newtypes for createIndices. --- src/Gargantext/Pipeline.hs | 4 ++-- src/Gargantext/Viz/Graph/Index.hs | 29 ++++++++++++++++++++--------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index b19738f9..ba20344c 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -21,7 +21,7 @@ import qualified Data.Map.Strict as M import Gargantext.Core (Lang(FR)) import Gargantext.Prelude -import Gargantext.Viz.Graph.Index (score, createIndexes, toIndex) +import Gargantext.Viz.Graph.Index (score, createIndices, toIndex) import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) @@ -43,7 +43,7 @@ pipeline path = do -- Cooc -> Matrix let theScores = M.filter (/=0) $ score distributional myCooc - let (ti, _) = createIndexes theScores + let (ti, _) = createIndices theScores -- Matrix -> Clustering -> Graph -> JSON pure $ bestpartition False $ map2graph $ toIndex ti theScores diff --git a/src/Gargantext/Viz/Graph/Index.hs b/src/Gargantext/Viz/Graph/Index.hs index 3ccf4a46..fd693a52 100644 --- a/src/Gargantext/Viz/Graph/Index.hs +++ b/src/Gargantext/Viz/Graph/Index.hs @@ -39,6 +39,8 @@ import qualified Data.Set as S import Data.Map (Map) import qualified Data.Map.Strict as M +import Data.Vector (Vector) + import Gargantext.Prelude type Index = Int @@ -50,7 +52,7 @@ score :: (Ord t) => (A.Matrix Int -> A.Matrix Double) -> Map (t, t) Double score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m where - (toI, fromI) = createIndexes m + (toI, fromI) = createIndices m ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- @@ -79,21 +81,30 @@ toIndex ni ns = indexConversion ni ns fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a fromIndex ni ns = indexConversion ni ns ---------------------------------------------------------------------------------- + indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms) +--------------------------------------------------------------------------------- + ------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t) -createIndexes = set2indexes . cooc2set +-- TODO +fromIndex' :: Ord t => Vector t -> Map (Index, Index) a -> Map (t,t) a +fromIndex' vi ns = undefined + +-- TODO +createIndices' :: Ord t => Map (t, t) b -> (Map t Index, Vector t) +createIndices' = undefined + +createIndices :: Ord t => Map (t, t) b -> (Map t Index, Map Index t) +createIndices = set2indices . map2set where - cooc2set :: Ord t => Map (t, t) a -> Set t - cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs') + map2set :: Ord t => Map (t, t) a -> Set t + map2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs') where insert as s = foldl' (\s' t -> S.insert t s') s as - set2indexes :: Ord t => Set t -> (Map t Index, Map Index t) - set2indexes s = (M.fromList toIndex', M.fromList fromIndex') + set2indices :: Ord t => Set t -> (Map t Index, Map Index t) + set2indices s = (M.fromList toIndex', M.fromList fromIndex') where fromIndex' = zip [0..] xs toIndex' = zip xs [0..] -- 2.47.0 From 00344aaf2b97c9b4105f46574703633d5b097b54 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Thu, 31 May 2018 10:49:51 +0200 Subject: [PATCH 10/16] [TERMS] adding type MonoMulti. --- src/Gargantext/Text/Terms.hs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/Gargantext/Text/Terms.hs b/src/Gargantext/Text/Terms.hs index 112de26d..4f17555f 100644 --- a/src/Gargantext/Text/Terms.hs +++ b/src/Gargantext/Text/Terms.hs @@ -42,16 +42,23 @@ import Gargantext.Core.Types import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Mono (monoterms') -data TermType = Mono | Multi +data TermType = Mono | Multi | MonoMulti -- remove Stop Words -- map (filter (\t -> not . elem t)) $ ------------------------------------------------------------------------ +-- | Sugar to extract terms from text (hiddeng mapM from end user). extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms]) extractTerms termType lang = mapM (terms termType lang) ------------------------------------------------------------------------ +-- | Terms from Text +-- Mono : mono terms +-- Multi : multi terms +-- MonoMulti : mono and multi +-- TODO : multi terms should exclude mono (intersection is not empty yet) terms :: TermType -> Lang -> Text -> IO [Terms] -terms Mono lang txt = pure $ monoterms' lang txt -terms Multi lang txt = multiterms lang txt +terms Mono lang txt = pure $ monoterms' lang txt +terms Multi lang txt = multiterms lang txt +terms MonoMulti lang txt = terms Multi lang txt ------------------------------------------------------------------------ -- 2.47.0 From 09cf291732121325305090854d465e848b16a47a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Thu, 31 May 2018 22:01:53 +0200 Subject: [PATCH 11/16] [Pipeline] clustering with C++ Louvain bindings, ok. --- src/Gargantext/Pipeline.hs | 18 +++++++-------- src/Gargantext/Viz/Graph.hs | 5 ---- src/Gargantext/Viz/Graph/Distances/Matrice.hs | 23 +++++++++++++------ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index ba20344c..a245f972 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -22,17 +22,16 @@ import Gargantext.Core (Lang(FR)) import Gargantext.Prelude import Gargantext.Viz.Graph.Index (score, createIndices, toIndex) -import Gargantext.Viz.Graph.Distances.Matrice (distributional) +import Gargantext.Viz.Graph.Distances.Matrice (conditional) import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) -import Data.Graph.Clustering.Louvain (bestpartition) -import Data.Graph.Clustering.Louvain.Utils (map2graph) +import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) pipeline path = do -- Text <- IO Text <- FilePath - text <- readFile path + text <- readFile path let contexts = splitBy (Sentences 3) text myterms <- extractTerms Multi FR contexts @@ -40,11 +39,12 @@ pipeline path = do -- TODO groupBy (Stem | GroupList) let myCooc = removeApax $ cooc myterms - -- Cooc -> Matrix - let theScores = M.filter (/=0) $ score distributional myCooc + let theScores = M.take 350 $ M.filter (>0) $ score conditional myCooc let (ti, _) = createIndices theScores - - -- Matrix -> Clustering -> Graph -> JSON - pure $ bestpartition False $ map2graph $ toIndex ti theScores +-- +---- -- Matrix -> Clustering -> Graph -> JSON +---- pure $ bestpartition False $ map2graph $ toIndex ti theScores + partitions <- cLouvain $ toIndex ti theScores + pure partitions diff --git a/src/Gargantext/Viz/Graph.hs b/src/Gargantext/Viz/Graph.hs index c1520ed0..332d6551 100644 --- a/src/Gargantext/Viz/Graph.hs +++ b/src/Gargantext/Viz/Graph.hs @@ -60,8 +60,3 @@ $(deriveJSON (unPrefix "g_") ''Graph) - - - - - diff --git a/src/Gargantext/Viz/Graph/Distances/Matrice.hs b/src/Gargantext/Viz/Graph/Distances/Matrice.hs index f5b26298..a7ed1c83 100644 --- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs +++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs @@ -89,8 +89,21 @@ type Matrix' a = Acc (Matrix a) type InclusionExclusion = Double type SpecificityGenericity = Double -conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) -conditional m = (run $ ie (use m), run $ sg (use m)) + +miniMax :: Matrix' Double -> Matrix' Double +miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m + where + miniMax' = (the $ minimum $ maximum m) + +conditional :: Matrix Int -> Matrix Double +conditional m = run (miniMax $ proba r $ map fromIntegral $ use m) + where + r :: Rank + r = rank' m + + +conditional' :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) +conditional' m = (run $ ie (use m), run $ sg (use m)) where ie :: Matrix' Double -> Matrix' Double @@ -115,14 +128,10 @@ conditional m = (run $ ie (use m), run $ sg (use m)) -- | Distributional Distance distributional :: Matrix Int -> Matrix Double -distributional m = run $ filter $ ri (map fromIntegral $ use m) +distributional m = run $ miniMax $ ri (map fromIntegral $ use m) where n = rank' m - miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m - where - miniMax' = (the $ minimum $ maximum m) - filter m = zipWith (\a b -> max a b) m (transpose m) ri mat = zipWith (/) mat1 mat2 -- 2.47.0 From c34120e3f0b2f47f1225f049a01337c2a5e7b48b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Fri, 1 Jun 2018 17:05:04 +0200 Subject: [PATCH 12/16] [FIX] fix cooc behavior. --- src/Gargantext/Pipeline.hs | 48 +++++++++++--- src/Gargantext/Prelude.hs | 2 - src/Gargantext/Text/Metrics.hs | 65 +++++++++++++++++-- src/Gargantext/Text/Metrics/Occurrences.hs | 24 +++---- src/Gargantext/Text/Terms/Multi/PosTagging.hs | 6 +- src/Gargantext/Viz/Graph/Distances/Matrice.hs | 18 ++++- 6 files changed, 132 insertions(+), 31 deletions(-) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index a245f972..1c5a7d72 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -16,19 +16,44 @@ module Gargantext.Pipeline where import Data.Text.IO (readFile) + +import Control.Arrow ((***)) +import Data.Map.Strict (Map) import qualified Data.Map.Strict as M +import qualified Data.Set as S +import qualified Data.List as L +import Data.Tuple.Extra (both) ---------------------------------------------- import Gargantext.Core (Lang(FR)) import Gargantext.Prelude -import Gargantext.Viz.Graph.Index (score, createIndices, toIndex) -import Gargantext.Viz.Graph.Distances.Matrice (conditional) +import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map) +import Gargantext.Viz.Graph.Distances.Matrice (incExcSpeGen, conditional) +import Gargantext.Viz.Graph.Index (Index) import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) +--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int +--filterCooc m = +---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection +----(ti, fi) = createIndices m +-- . fromIndex fi $ filterMat $ cooc2mat ti m + + + +import Data.Array.Accelerate (Matrix) +filterMat :: Matrix Int -> [(Index, Index)] +filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen') + where + (incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (incExcSpeGen m) + n = nIe + nSg + nIe = 30 + nSg = 70 + + pipeline path = do -- Text <- IO Text <- FilePath text <- readFile path @@ -39,12 +64,17 @@ pipeline path = do -- TODO groupBy (Stem | GroupList) let myCooc = removeApax $ cooc myterms + let (ti, fi) = createIndices myCooc + pure ti -- Cooc -> Matrix - let theScores = M.take 350 $ M.filter (>0) $ score conditional myCooc - let (ti, _) = createIndices theScores --- ----- -- Matrix -> Clustering -> Graph -> JSON ----- pure $ bestpartition False $ map2graph $ toIndex ti theScores - partitions <- cLouvain $ toIndex ti theScores - pure partitions + +-- -- filter by spec/gen (dynmaic programming) +-- let theScores = M.filter (>0) $ score conditional myCoocFiltered +---- +------ -- Matrix -> Clustering +------ pure $ bestpartition False $ map2graph $ toIndex ti theScores +-- partitions <- cLouvain theScores +-- pure partitions +---- | Building : -> Graph -> JSON + diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs index 4cad2f18..6b12fdd6 100644 --- a/src/Gargantext/Prelude.hs +++ b/src/Gargantext/Prelude.hs @@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust -- maximumWith maximumWith f = L.maximumBy (\x y -> compare (f x) (f y)) - - diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs index 4904a5cf..c37e5ac8 100644 --- a/src/Gargantext/Text/Metrics.hs +++ b/src/Gargantext/Text/Metrics.hs @@ -14,12 +14,69 @@ Mainly reexport functions in @Data.Text.Metrics@ module Gargantext.Text.Metrics where ---import Data.Text (Text) +import Data.Text (Text, pack) +import Data.List (concat) + --import GHC.Real (Ratio) --import qualified Data.Text.Metrics as DTM --- ---import Gargantext.Prelude --- + +import Gargantext.Prelude + +import Gargantext.Text.Metrics.Occurrences (occurrences, cooc) +import Gargantext.Text.Terms (TermType(Multi), terms) +import Gargantext.Core (Lang(EN)) +import Gargantext.Core.Types (Terms(..)) +import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) + --noApax :: Ord a => Map a Occ -> Map a Occ --noApax m = M.filter (>1) m + +metrics_text :: Text +metrics_text = "A table is an object. A glas is an object. The glas is on the table. The spoon is an object. The spoon is on the table." + +-- | Sentences +metrics_sentences :: [Text] +metrics_sentences = [ "A table is an object." + , "A glas is an object." + , "The glas is on the table." + , "The spoon is an object." + , "The spoon is on the table." + ] + + +metrics_sentences_Test = splitBy (Sentences 0) metrics_text == metrics_sentences + +-- | Terms reordered to visually check occurrences +metrics_terms :: [[[Text]]] +metrics_terms = [[["table"],["object"] ] + ,[ ["object"],["glas"] ] + ,[["table"], ["glas"] ] + ,[ ["object"], ["spoon"]] + ,[["table"], ["spoon"]] + ] +--metrics_terms_Test = (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text) == metrics_terms + +-- | Occurrences +{- +fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])] + , (fromList ["object"],fromList [(["object"], 3 )]) + , (fromList ["glas"] ,fromList [(["glas"] , 2 )]) + , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )]) +-} +metrics_occ = occurrences <$> concat <$> (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text) + +{- +-- fromList [((["glas"],["object"]),6) + ,((["glas"],["spoon"]),4) + ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)] + +-} +metrics_cooc = cooc <$> (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text) + +metrics_cooc' = (mapM (terms Multi EN) $ splitBy (Sentences 0) "The table object. The table object.") + + + + + diff --git a/src/Gargantext/Text/Metrics/Occurrences.hs b/src/Gargantext/Text/Metrics/Occurrences.hs index 00a0f429..dbfc692f 100644 --- a/src/Gargantext/Text/Metrics/Occurrences.hs +++ b/src/Gargantext/Text/Metrics/Occurrences.hs @@ -82,9 +82,9 @@ removeApax = DMS.filter (> 1) cooc :: [[Terms]] -> Map (Label, Label) Int cooc tss = - mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss) + mapKeys (delta $ labelPolicy terms_occs) $ coocOn _terms_stem tss where - terms_occs = occurrences (List.concat tss) + terms_occs = occurrencesOn _terms_stem (List.concat tss) delta f = f *** f @@ -93,24 +93,26 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList Just label -> label Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g) -cooc' :: Ord b => [Set b] -> Map (b, b) Coocs -cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs +coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs +coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as + +coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs +coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs where + ts' = List.nub $ map f ts xs = [ ((x, y), 1) - | xs <- tss - , ys <- tss - , x <- Set.toList xs - , y <- Set.toList ys + | x <- ts' + , y <- ts' , x < y ] -- | Compute the grouped occurrences (occ) occurrences :: [Terms] -> Map Grouped (Map Terms Int) -occurrences = occurrences' _terms_stem +occurrences = occurrencesOn _terms_stem -occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int) -occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty +occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int) +occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty -- TODO add groups and filter stops sumOcc :: Ord a => [Occ a] -> Occ a diff --git a/src/Gargantext/Text/Terms/Multi/PosTagging.hs b/src/Gargantext/Text/Terms/Multi/PosTagging.hs index 3622063f..b6dceec7 100644 --- a/src/Gargantext/Text/Terms/Multi/PosTagging.hs +++ b/src/Gargantext/Text/Terms/Multi/PosTagging.hs @@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators :: Text $(deriveJSON (unPrefix "_properties") ''Properties) -data Sentences = Sentences { _sentences :: [Sentence]} +data PosSentences = PosSentences { _sentences :: [Sentence]} deriving (Show, Generic) -$(deriveJSON (unPrefix "_") ''Sentences) +$(deriveJSON (unPrefix "_") ''PosSentences) -- request = @@ -134,7 +134,7 @@ corenlpRaw lang txt = do pure (getResponseBody response) -corenlp :: Lang -> Text -> IO Sentences +corenlp :: Lang -> Text -> IO PosSentences corenlp lang txt = do response <- corenlp' lang txt pure (getResponseBody response) diff --git a/src/Gargantext/Viz/Graph/Distances/Matrice.hs b/src/Gargantext/Viz/Graph/Distances/Matrice.hs index a7ed1c83..1aa4db24 100644 --- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs +++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs @@ -102,8 +102,22 @@ conditional m = run (miniMax $ proba r $ map fromIntegral $ use m) r = rank' m -conditional' :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) -conditional' m = (run $ ie (use m), run $ sg (use m)) +{- +Metric Specificity and genericty: select terms + Compute genericity/specificity: + P(j|i) = N(ij) / N(ii) + P(i|j) = N(ij) / N(jj) + + Gen(i) = Mean{j} P(j_k|i) + Spec(i) = Mean{j} P(i|j_k) + + Gen-clusion(i) = (Spec(i) + Gen(i)) / 2 + Spec-clusion(i) = (Spec(i) - Gen(i)) / 2 + +-} + +incExcSpeGen :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) +incExcSpeGen m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m) where ie :: Matrix' Double -> Matrix' Double -- 2.47.0 From 6551bf9017a3141cce92604e799d579044063222 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Sat, 2 Jun 2018 22:50:52 +0200 Subject: [PATCH 13/16] [COUNT] renaming file and generic function. --- src/Gargantext/Pipeline.hs | 2 +- src/Gargantext/Text.hs | 2 +- src/Gargantext/Text/Metrics.hs | 43 +++++++++++-------- .../Text/Metrics/{Occurrences.hs => Count.hs} | 34 +++++++++------ src/Gargantext/Viz/Graph/Distances/Matrice.hs | 2 +- 5 files changed, 47 insertions(+), 36 deletions(-) rename src/Gargantext/Text/Metrics/{Occurrences.hs => Count.hs} (86%) diff --git a/src/Gargantext/Pipeline.hs b/src/Gargantext/Pipeline.hs index 1c5a7d72..c12bbace 100644 --- a/src/Gargantext/Pipeline.hs +++ b/src/Gargantext/Pipeline.hs @@ -30,7 +30,7 @@ import Gargantext.Prelude import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map) import Gargantext.Viz.Graph.Distances.Matrice (incExcSpeGen, conditional) import Gargantext.Viz.Graph.Index (Index) -import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) +import Gargantext.Text.Metrics.Count (cooc, removeApax) import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) diff --git a/src/Gargantext/Text.hs b/src/Gargantext/Text.hs index 11ca201f..c31aedc0 100644 --- a/src/Gargantext/Text.hs +++ b/src/Gargantext/Text.hs @@ -27,7 +27,7 @@ import NLP.FullStop (segment) ----------------------------------------------------------------- import Gargantext.Core import Gargantext.Core.Types -import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc) +import Gargantext.Text.Metrics.Count (Occ, occurrences, cooc) import Gargantext.Prelude hiding (filter) ----------------------------------------------------------------- diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs index c37e5ac8..925dea93 100644 --- a/src/Gargantext/Text/Metrics.hs +++ b/src/Gargantext/Text/Metrics.hs @@ -15,6 +15,7 @@ Mainly reexport functions in @Data.Text.Metrics@ module Gargantext.Text.Metrics where import Data.Text (Text, pack) +import qualified Data.Text as T import Data.List (concat) --import GHC.Real (Ratio) @@ -22,7 +23,7 @@ import Data.List (concat) import Gargantext.Prelude -import Gargantext.Text.Metrics.Occurrences (occurrences, cooc) +import Gargantext.Text.Metrics.Count (occurrences, cooc) import Gargantext.Text.Terms (TermType(Multi), terms) import Gargantext.Core (Lang(EN)) import Gargantext.Core.Types (Terms(..)) @@ -33,29 +34,35 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) metrics_text :: Text -metrics_text = "A table is an object. A glas is an object. The glas is on the table. The spoon is an object. The spoon is on the table." +metrics_text = T.concat ["A table is an object." + ,"A glas is an object too." + ,"Using a glas to dring is a function." + ,"Using a spoon to eat is a function." + ,"The spoon is an object to eat." + ] + +metrics_sentences' :: [Text] +metrics_sentences' = splitBy (Sentences 0) metrics_text -- | Sentences metrics_sentences :: [Text] -metrics_sentences = [ "A table is an object." - , "A glas is an object." - , "The glas is on the table." - , "The spoon is an object." - , "The spoon is on the table." - ] +metrics_sentences = ["A table is an object." + ,"A glas is an object too." + ,"The glas and the spoon are on the table." + ,"The spoon is an object to eat." + ,"The spoon is on the table and the plate and the glas."] -metrics_sentences_Test = splitBy (Sentences 0) metrics_text == metrics_sentences +metrics_sentences_Test = metrics_sentences == metrics_sentences' -- | Terms reordered to visually check occurrences -metrics_terms :: [[[Text]]] -metrics_terms = [[["table"],["object"] ] - ,[ ["object"],["glas"] ] - ,[["table"], ["glas"] ] - ,[ ["object"], ["spoon"]] - ,[["table"], ["spoon"]] - ] ---metrics_terms_Test = (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text) == metrics_terms +metrics_terms :: [[Text]] +metrics_terms = undefined + +metrics_terms' :: IO [[Terms]] +metrics_terms' = mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text + +--metrics_terms_Test = metrics_terms == ((map _terms_label) <$> metrics_terms') -- | Occurrences {- @@ -78,5 +85,3 @@ metrics_cooc' = (mapM (terms Multi EN) $ splitBy (Sentences 0) "The table object - - diff --git a/src/Gargantext/Text/Metrics/Occurrences.hs b/src/Gargantext/Text/Metrics/Count.hs similarity index 86% rename from src/Gargantext/Text/Metrics/Occurrences.hs rename to src/Gargantext/Text/Metrics/Count.hs index dbfc692f..eccc04c9 100644 --- a/src/Gargantext/Text/Metrics/Occurrences.hs +++ b/src/Gargantext/Text/Metrics/Count.hs @@ -1,5 +1,5 @@ {-| -Module : Gargantext.Text.Metrics.Occurrences +Module : Gargantext.Text.Metrics.Count Description : Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 @@ -25,7 +25,7 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} -module Gargantext.Text.Metrics.Occurrences +module Gargantext.Text.Metrics.Count where @@ -71,7 +71,6 @@ type Grouped = Stems --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"] --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)] ---- - -} type Occs = Int @@ -81,10 +80,16 @@ removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int removeApax = DMS.filter (> 1) cooc :: [[Terms]] -> Map (Label, Label) Int -cooc tss = - mapKeys (delta $ labelPolicy terms_occs) $ coocOn _terms_stem tss +cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss where terms_occs = occurrencesOn _terms_stem (List.concat tss) + + +coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label) + -> [[a]] -> Map (label, label) Coocs +coocOnWithLabel on policy tss = + mapKeys (delta policy) $ coocOn on tss + where delta f = f *** f @@ -95,16 +100,16 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as - -coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs -coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs where - ts' = List.nub $ map f ts - xs = [ ((x, y), 1) - | x <- ts' - , y <- ts' - , x < y - ] + coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs + coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs + where + ts' = List.nub $ map f ts + xs = [ ((x, y), 1) + | x <- ts' + , y <- ts' + , x < y + ] -- | Compute the grouped occurrences (occ) @@ -115,6 +120,7 @@ occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int) occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty -- TODO add groups and filter stops + sumOcc :: Ord a => [Occ a] -> Occ a sumOcc xs = foldl' (unionWith (+)) empty xs diff --git a/src/Gargantext/Viz/Graph/Distances/Matrice.hs b/src/Gargantext/Viz/Graph/Distances/Matrice.hs index 1aa4db24..f010d5dd 100644 --- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs +++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs @@ -46,7 +46,7 @@ import Data.Maybe (Maybe(Just)) import qualified Gargantext.Prelude as P import qualified Data.Array.Accelerate.Array.Representation as Repr -import Gargantext.Text.Metrics.Occurrences +import Gargantext.Text.Metrics.Count ----------------------------------------------------------------------- -- 2.47.0 From 42ab55b919c26ac11d3a7e8c6ca724ac189f8bdb Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexandre=20Delano=C3=AB?= Date: Sat, 2 Jun 2018 22:58:08 +0200 Subject: [PATCH 14/16] [FIX] Corenlp lems -> using stem. --- src/Gargantext/Text/Terms/Multi.hs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Gargantext/Text/Terms/Multi.hs b/src/Gargantext/Text/Terms/Multi.hs index f92dd61a..ee676498 100644 --- a/src/Gargantext/Text/Terms/Multi.hs +++ b/src/Gargantext/Text/Terms/Multi.hs @@ -18,23 +18,27 @@ module Gargantext.Text.Terms.Multi (multiterms) import Data.Text hiding (map, group, filter, concat) import Data.List (concat) +import qualified Data.Set as S import Gargantext.Prelude import Gargantext.Core (Lang(..)) import Gargantext.Core.Types import Gargantext.Text.Terms.Multi.PosTagging +import Gargantext.Text.Terms.Mono.Stem (stem) import qualified Gargantext.Text.Terms.Multi.Lang.En as En import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr multiterms :: Lang -> Text -> IO [Terms] multiterms lang txt = concat - <$> map (map tokenTag2terms) + <$> map (map (tokenTag2terms lang)) <$> map (filter (\t -> _my_token_pos t == Just NP)) <$> tokenTags lang txt -tokenTag2terms :: TokenTag -> Terms -tokenTag2terms (TokenTag w t _ _) = Terms w t +tokenTag2terms :: Lang -> TokenTag -> Terms +tokenTag2terms lang (TokenTag w t _ _) = Terms w t' + where + t' = S.fromList $ map (stem lang) $ S.toList t tokenTags :: Lang -> Text -> IO [[TokenTag]] tokenTags lang s = map (group lang) <$> tokenTags' lang s -- 2.47.0 From 58c1c24985421cd1d4e5bb422eaa196aa014bebd Mon Sep 17 00:00:00 2001 From: Nicolas Pouillard Date: Mon, 4 Jun 2018 13:43:52 +0200 Subject: [PATCH 15/16] Basic polymorphic version of FrequentItemSet --- .../Text/Metrics/FrequentItemSet.hs | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/src/Gargantext/Text/Metrics/FrequentItemSet.hs b/src/Gargantext/Text/Metrics/FrequentItemSet.hs index f4ab2b84..105250c1 100644 --- a/src/Gargantext/Text/Metrics/FrequentItemSet.hs +++ b/src/Gargantext/Text/Metrics/FrequentItemSet.hs @@ -14,42 +14,48 @@ Domain Specific Language to manage Frequent Item Set (FIS) {-# LANGUAGE NoImplicitPrelude #-} module Gargantext.Text.Metrics.FrequentItemSet - ( Fis, Size + ( Fis, Size(..) , occ_hlcm, cooc_hlcm , all, between , module HLCM ) where -import Data.List (tail, filter) -import Data.Either +import Prelude (Functor(..)) -- TODO +import qualified Data.Map.Strict as Map +import Data.Map.Strict (Map) +import qualified Data.Set as Set +import Data.Set (Set) +import qualified Data.Vector as V +import Data.Vector (Vector) + +import Data.List (filter, concat) +import Data.Maybe (catMaybes) import HLCM import Gargantext.Prelude -type Size = Either Int (Int, Int) - ---data Size = Point | Segment +data Size = Point Int | Segment Int Int ------------------------------------------------------------------------ -- | Occurrence is Frequent Item Set of size 1 occ_hlcm :: Frequency -> [[Item]] -> [Fis] -occ_hlcm f is = fisWithSize (Left 1) f is +occ_hlcm = fisWithSize (Point 1) -- | Cooccurrence is Frequent Item Set of size 2 cooc_hlcm :: Frequency -> [[Item]] -> [Fis] -cooc_hlcm f is = fisWithSize (Left 2) f is +cooc_hlcm = fisWithSize (Point 2) all :: Frequency -> [[Item]] -> [Fis] -all f is = fisWith Nothing f is +all = fisWith Nothing ------------------------------------------------------------------------ between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis] -between (x,y) f is = fisWithSize (Right (x,y)) f is +between (x,y) = fisWithSize (Segment x y) --maximum :: Int -> Frequency -> [[Item]] -> [Fis] ---maximum m f is = between (0,m) f is +--maximum m = between (0,m) ------------------------------------------------------------------------ @@ -62,31 +68,48 @@ data Fis' a = Fis' { _fisCount :: Int , _fisItemSet :: [a] } deriving (Show) +instance Functor Fis' where + fmap f (Fis' c is) = Fis' c (fmap f is) + -- | Sugar from items to FIS items2fis :: [Item] -> Maybe Fis -items2fis is = case head is of - Nothing -> Nothing - Just h -> Just (Fis' h (tail is)) +items2fis [] = Nothing +items2fis (i:is) = Just $ Fis' i is ------------------------------------------------------------------------ ------------------------------------------------------------------------ fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis] fisWithSize n f is = case n of - Left n' -> fisWith (Just (\x -> length x == (n'+1) )) f is - Right (a,b) -> fisWith (Just (\x -> cond1 a x && cond2 b x)) f is + Point n' -> fisWith (Just (\x -> length x == (n'+1) )) f is + Segment a b -> fisWith (Just (\x -> cond a (length x) b)) f is where - cond1 a' x = length x >= a' - cond2 b' x = length x <= b' + cond a' x b' = a' <= x && x <= b' fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis] -fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f +fisWith s f is = catMaybes $ map items2fis $ filter' $ runLCMmatrix is f where filter' = case s of Nothing -> identity Just fun -> filter fun +-- Here the sole purpose to take the keys as a Set is tell we do not want +-- duplicates. +fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a] +fisWithSizePoly n f ks = map (fmap fromItem) . fisWithSize n f . map (map toItem) + where + ksv = V.fromList $ Set.toList ks + ksm = Map.fromList . flip zip [0..] $ V.toList ksv + toItem = (ksm Map.!) + fromItem = (ksv V.!) + +fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a] +fisWithSizePoly2 n f is = fisWithSizePoly n f ks is + where + ks = Set.fromList $ concat is + + ------------------------------------------------------------------------ ------------------------------------------------------------------------ -- 2.47.0 From 945d8e2e6e82e8b8dfaad8cc55fa237b39c356d3 Mon Sep 17 00:00:00 2001 From: Nicolas Pouillard Date: Mon, 4 Jun 2018 13:45:02 +0200 Subject: [PATCH 16/16] Minor build/packaging tweaks --- package.yaml | 2 +- src/Gargantext/Text/Metrics.hs | 1 + stack.yaml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/package.yaml b/package.yaml index 2de9e6a0..4b5a26cd 100644 --- a/package.yaml +++ b/package.yaml @@ -35,7 +35,6 @@ library: dependencies: - QuickCheck - accelerate - - accelerate-io - aeson - aeson-lens - aeson-pretty @@ -46,6 +45,7 @@ library: - bytestring - case-insensitive - cassava + - clustering-louvain - conduit - conduit-extra - containers diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs index 925dea93..08a2db87 100644 --- a/src/Gargantext/Text/Metrics.hs +++ b/src/Gargantext/Text/Metrics.hs @@ -11,6 +11,7 @@ Mainly reexport functions in @Data.Text.Metrics@ -} {-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE OverloadedStrings #-} module Gargantext.Text.Metrics where diff --git a/stack.yaml b/stack.yaml index 23363269..d0c227db 100644 --- a/stack.yaml +++ b/stack.yaml @@ -14,7 +14,7 @@ extra-deps: - git: https://github.com/delanoe/servant-static-th.git commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434 - accelerate-1.2.0.0 -- accelerate-io-1.2.0.0 +- hashtables-1.2.3.0 # needed by accelerate-1.2.0.0 - aeson-1.2.4.0 - aeson-lens-0.5.0.0 - duckling-0.1.3.0 -- 2.47.0