2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
24 import Control.Monad.IO.Class (MonadIO)
26 import Data.Map.Strict (Map)
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Core (Lang)
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
34 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
35 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
36 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
37 import Gargantext.Text.Metrics.Count (cooc)
38 import Gargantext.Text.Metrics
39 import Gargantext.Text.Terms (TermType, extractTerms)
40 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
42 import Gargantext.Text.Parsers.CSV
44 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
56 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
57 printDebug msg x = putStrLn $ msg <> " " <> show x
58 --printDebug _ _ = pure ()
60 data TextFlow = CSV FilePath
65 -- | ExtDatabase Query
66 -- | IntDatabase NodeId
68 textFlow :: TermType Lang -> TextFlow -> IO Graph
69 textFlow termType workType = do
70 contexts <- case workType of
71 FullText path -> splitBy (Sentences 5) <$> readFile path
72 CSV path -> readCsvOn [csv_title, csv_abstract] path
73 Contexts ctxt -> pure ctxt
76 textFlow' termType contexts
79 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
80 textFlow' termType contexts = do
81 -- Context :: Text -> [Text]
82 -- Contexts = Paragraphs n | Sentences n | Chars n
84 myterms <- extractTerms termType contexts
85 -- TermsType = Mono | Multi | MonoMulti
86 -- myterms # filter (\t -> not . elem t stopList)
87 -- # groupBy (Stem|GroupList|Ontology)
88 printDebug "myterms" (sum $ map length myterms)
90 -- Bulding the map list
91 -- compute copresences of terms
92 -- Cooc = Map (Term, Term) Int
93 let myCooc1 = cooc myterms
94 printDebug "myCooc1" (M.size myCooc1)
96 -- Remove Apax: appears one time only => lighting the matrix
97 let myCooc2 = M.filter (>1) myCooc1
98 printDebug "myCooc2" (M.size myCooc2)
100 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
101 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
102 (InclusionSize 4000 )
107 printDebug "myCooc3" $ M.size myCooc3
110 let (ti, _) = createIndices myCooc3
111 printDebug "ti" $ M.size ti
113 let myCooc4 = toIndex ti myCooc3
114 printDebug "myCooc4" $ M.size myCooc4
116 let matCooc = map2mat (0) (M.size ti) myCooc4
117 --printDebug "matCooc" matCooc
118 -- Matrix -> Clustering
119 let distanceMat = conditional matCooc
120 -- let distanceMat = distributional matCooc
121 printDebug "distanceMat" $ A.arrayShape distanceMat
122 --printDebug "distanceMat" distanceMat
124 let distanceMap = mat2map distanceMat
125 printDebug "distanceMap" $ M.size distanceMap
127 -- let distance = fromIndex fi distanceMap
128 -- printDebug "distance" $ M.size distance
130 partitions <- cLouvain distanceMap
131 ------ | Building : -> Graph -> JSON
132 printDebug "partitions" $ length partitions
133 --printDebug "partitions" partitions
134 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
136 -----------------------------------------------------------
137 -- distance should not be a map since we just "toList" it (same as cLouvain)
138 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
139 -> Map (Int, Int) Double
142 data2graph labels coocs distance partitions = Graph nodes edges
144 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
145 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
146 , node_type = Terms -- or Unknown
147 , node_id = cs (show n)
148 , node_label = T.unwords l
150 Attributes { clust_default = maybe 0 identity
151 (M.lookup n community_id_by_node_id) } }
153 edges = [ Edge { edge_source = cs (show s)
154 , edge_target = cs (show t)
156 , edge_id = cs (show i) }
157 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
158 -----------------------------------------------------------