2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
24 import Control.Monad.IO.Class (MonadIO)
26 import Data.Map.Strict (Map)
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Core (Lang(FR))
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
34 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
35 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
36 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
37 import Gargantext.Text.Metrics.Count (cooc)
38 import Gargantext.Text.Metrics
39 import Gargantext.Text.Terms (TermType(Mono), extractTerms)
40 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
42 import Gargantext.Text.Parsers.CSV
44 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
57 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
58 printDebug msg x = putStrLn $ msg <> " " <> show x
59 --printDebug _ _ = pure ()
61 data TextFlow = CSV | FullText
63 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
64 textflow :: Lang -> TextFlow -> FilePath -> IO Graph
65 textflow _ workType path = do
66 -- Text <- IO Text <- FilePath
67 contexts <- case workType of
68 FullText -> splitBy (Sentences 5) <$> readFile path
69 CSV -> readCsvOn [csv_title, csv_abstract] path
71 -- Context :: Text -> [Text]
72 -- Contexts = Paragraphs n | Sentences n | Chars n
74 myterms <- extractTerms (Mono FR) contexts
75 -- TermsType = Mono | Multi | MonoMulti
76 -- myterms # filter (\t -> not . elem t stopList)
77 -- # groupBy (Stem|GroupList|Ontology)
78 printDebug "myterms" (sum $ map length myterms)
80 -- Bulding the map list
81 -- compute copresences of terms
82 -- Cooc = Map (Term, Term) Int
83 let myCooc1 = cooc myterms
84 printDebug "myCooc1" (M.size myCooc1)
86 -- Remove Apax: appears one time only => lighting the matrix
87 let myCooc2 = M.filter (>1) myCooc1
88 printDebug "myCooc2" (M.size myCooc2)
90 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
91 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
97 printDebug "myCooc3" $ M.size myCooc3
100 let (ti, _) = createIndices myCooc3
101 printDebug "ti" $ M.size ti
103 let myCooc4 = toIndex ti myCooc3
104 printDebug "myCooc4" $ M.size myCooc4
106 let matCooc = map2mat (0) (M.size ti) myCooc4
107 --printDebug "matCooc" matCooc
108 -- Matrix -> Clustering
109 let distanceMat = conditional matCooc
110 -- let distanceMat = distributional matCooc
111 printDebug "distanceMat" $ A.arrayShape distanceMat
112 --printDebug "distanceMat" distanceMat
114 let distanceMap = mat2map distanceMat
115 printDebug "distanceMap" $ M.size distanceMap
117 -- let distance = fromIndex fi distanceMap
118 -- printDebug "distance" $ M.size distance
120 partitions <- cLouvain distanceMap
121 ------ | Building : -> Graph -> JSON
122 printDebug "partitions" $ length partitions
123 --printDebug "partitions" partitions
124 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
127 -----------------------------------------------------------
128 -- distance should not be a map since we just "toList" it (same as cLouvain)
129 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
130 -> Map (Int, Int) Double
133 data2graph labels coocs distance partitions = Graph nodes edges
135 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
136 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
137 , node_type = Terms -- or Unknown
138 , node_id = cs (show n)
139 , node_label = T.unwords l
141 Attributes { clust_default = maybe 0 identity
142 (M.lookup n community_id_by_node_id) } }
144 edges = [ Edge { edge_source = cs (show s)
145 , edge_target = cs (show t)
147 , edge_id = cs (show i) }
148 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
149 -----------------------------------------------------------