2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import qualified Data.Text as T
21 import Data.Text.IO (readFile)
23 import Control.Arrow ((***))
24 import Data.Map.Strict (Map)
25 import qualified Data.Array.Accelerate as A
26 import qualified Data.Map.Strict as M
27 import qualified Data.List as L
28 import Data.Tuple.Extra (both)
29 ----------------------------------------------
30 import Gargantext.Core (Lang(FR))
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
33 import Prelude (print, seq)
35 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
36 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
37 import Gargantext.Viz.Graph.Index (Index)
38 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
39 import Gargantext.Text.Metrics.Count (cooc)
40 import Gargantext.Text.Metrics
41 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
42 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
44 import Gargantext.Text.Parsers.CSV
46 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
51 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
52 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
53 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
54 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
59 data TextFlow = CSV | FullText
61 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
62 textflow termsLang workType path = do
63 -- Text <- IO Text <- FilePath
64 contexts <- case workType of
65 FullText -> splitBy (Sentences 5) <$> readFile path
66 CSV -> readCsvOn [csv_title, csv_abstract] path
68 -- Context :: Text -> [Text]
69 -- Contexts = Paragraphs n | Sentences n | Chars n
71 myterms <- extractTerms (Mono FR) contexts
72 -- TermsType = Mono | Multi | MonoMulti
73 -- myterms # filter (\t -> not . elem t stopList)
74 -- # groupBy (Stem|GroupList|Ontology)
75 printDebug "myterms" (sum $ map length myterms)
77 -- Bulding the map list
78 -- compute copresences of terms
79 -- Cooc = Map (Term, Term) Int
80 let myCooc1 = cooc myterms
81 printDebug "myCooc1" (M.size myCooc1)
83 -- Remove Apax: appears one time only => lighting the matrix
84 let myCooc2 = M.filter (>1) myCooc1
85 printDebug "myCooc2" (M.size myCooc2)
87 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
88 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
94 printDebug "myCooc3" $ M.size myCooc3
97 let (ti, fi) = createIndices myCooc3
98 printDebug "ti" $ M.size ti
100 let myCooc4 = toIndex ti myCooc3
101 printDebug "myCooc4" $ M.size myCooc4
103 let matCooc = map2mat (0) (M.size ti) myCooc4
104 --printDebug "matCooc" matCooc
105 -- Matrix -> Clustering
106 let distanceMat = conditional matCooc
107 -- let distanceMat = distributional matCooc
108 printDebug "distanceMat" $ A.arrayShape distanceMat
109 --printDebug "distanceMat" distanceMat
111 let distanceMap = mat2map distanceMat
112 printDebug "distanceMap" $ M.size distanceMap
114 -- let distance = fromIndex fi distanceMap
115 -- printDebug "distance" $ M.size distance
117 partitions <- cLouvain distanceMap
118 ------ | Building : -> Graph -> JSON
119 printDebug "partitions" $ length partitions
120 --printDebug "partitions" partitions
121 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
124 -----------------------------------------------------------
125 -- distance should not be a map since we just "toList" it (same as cLouvain)
126 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
127 -> Map (Int, Int) Double
130 data2graph labels coocs distance partitions = Graph nodes edges
132 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
133 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
134 , n_type = Terms -- or Unknown
136 , n_label = T.unwords l
138 Attributes { clust_default = maybe 0 identity
139 (M.lookup n community_id_by_node_id) } }
141 edges = [ Edge { e_source = s
145 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
146 -----------------------------------------------------------
148 printDebug msg x = putStrLn $ msg <> " " <> show x
149 --printDebug _ _ = pure ()