2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
25 import Data.Map.Strict (Map)
26 import qualified Data.Array.Accelerate as A
27 import qualified Data.Map.Strict as M
28 ----------------------------------------------
29 import Gargantext.Core (Lang)
30 import Gargantext.Core.Types (Label)
31 import Gargantext.Prelude
33 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
34 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType, extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41 import Gargantext.Text.Parsers.CSV
43 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
48 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
49 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
50 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
51 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
56 data TextFlow = CSV FilePath
64 textFlow :: TermType Lang -> TextFlow -> IO Graph
65 textFlow termType workType = do
66 contexts <- case workType of
67 FullText path -> splitBy (Sentences 5) <$> readFile path
68 CSV path -> readCsvOn [csv_title, csv_abstract] path
69 Contexts ctxt -> pure ctxt
72 textFlow' termType contexts
75 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
76 textFlow' termType contexts = do
77 -- Context :: Text -> [Text]
78 -- Contexts = Paragraphs n | Sentences n | Chars n
80 myterms <- extractTerms termType contexts
81 -- TermsType = Mono | Multi | MonoMulti
82 -- myterms # filter (\t -> not . elem t stopList)
83 -- # groupBy (Stem|GroupList|Ontology)
84 printDebug "myterms" (sum $ map length myterms)
86 -- Bulding the map list
87 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
88 -- Cooc = Map (Term, Term) Int
89 let myCooc1 = cooc myterms
90 printDebug "myCooc1" (M.size myCooc1)
92 -- Remove Apax: appears one time only => lighting the matrix
93 let myCooc2 = M.filter (>1) myCooc1
94 printDebug "myCooc2" (M.size myCooc2)
96 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
97 let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
103 printDebug "myCooc3" $ M.size myCooc3
104 -- putStrLn $ show myCooc3
107 let (ti, _) = createIndices myCooc3
108 printDebug "ti" $ M.size ti
110 let myCooc4 = toIndex ti myCooc3
111 printDebug "myCooc4" $ M.size myCooc4
113 let matCooc = map2mat (0) (M.size ti) myCooc4
114 -- printDebug "matCooc" matCooc
115 -- Matrix -> Clustering
116 let distanceMat = conditional matCooc
117 -- let distanceMat = distributional matCooc
118 printDebug "distanceMat" $ A.arrayShape distanceMat
119 -- printDebug "distanceMat" distanceMat
121 let distanceMap = mat2map distanceMat
122 printDebug "distanceMap" $ M.size distanceMap
124 -- let distance = fromIndex fi distanceMap
125 -- printDebug "distance" $ M.size distance
127 partitions <- cLouvain distanceMap
128 -- Building : -> Graph -> JSON
129 printDebug "partitions" $ length partitions
130 --printDebug "partitions" partitions
131 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
133 -----------------------------------------------------------
134 -- | From data to Graph
135 -- FIXME: distance should not be a map since we just "toList" it (same as cLouvain)
136 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
137 -> Map (Int, Int) Double
140 data2graph labels coocs distance partitions = Graph nodes edges
142 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
143 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
144 , node_type = Terms -- or Unknown
145 , node_id = cs (show n)
146 , node_label = T.unwords l
148 Attributes { clust_default = maybe 0 identity
149 (M.lookup n community_id_by_node_id) } }
151 edges = [ Edge { edge_source = cs (show s)
152 , edge_target = cs (show t)
154 , edge_id = cs (show i) }
155 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
156 -----------------------------------------------------------