2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
14 module Gargantext.Pipeline
17 import qualified Data.Text as T
18 import Data.Text.IO (readFile)
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Array.Accelerate as A
23 import qualified Data.Map.Strict as M
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Core.Types (Label)
29 import Gargantext.Prelude
30 import Prelude (print, seq)
32 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
33 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
34 import Gargantext.Viz.Graph.Index (Index)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41 import Gargantext.Text.Parsers.CSV
43 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
48 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
49 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
50 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
51 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
56 data WorkType = CSV | FullText
58 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
59 workflow termsLang workType path = do
60 -- Text <- IO Text <- FilePath
61 contexts <- case workType of
62 FullText -> splitBy (Sentences 5) <$> readFile path
63 CSV -> readCsvOn [csv_title, csv_abstract] path
65 -- Context :: Text -> [Text]
66 -- Contexts = Paragraphs n | Sentences n | Chars n
68 myterms <- extractTerms (Mono FR) contexts
69 -- TermsType = Mono | Multi | MonoMulti
70 -- myterms # filter (\t -> not . elem t stopList)
71 -- # groupBy (Stem|GroupList|Ontology)
72 printDebug "myterms" (sum $ map length myterms)
74 -- Bulding the map list
75 -- compute copresences of terms
76 -- Cooc = Map (Term, Term) Int
77 let myCooc1 = cooc myterms
78 printDebug "myCooc1" (M.size myCooc1)
80 -- Remove Apax: appears one time only => lighting the matrix
81 let myCooc2 = M.filter (>1) myCooc1
82 printDebug "myCooc2" (M.size myCooc2)
84 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
85 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
91 printDebug "myCooc3" $ M.size myCooc3
94 let (ti, fi) = createIndices myCooc3
95 printDebug "ti" $ M.size ti
97 let myCooc4 = toIndex ti myCooc3
98 printDebug "myCooc4" $ M.size myCooc4
100 let matCooc = map2mat (0) (M.size ti) myCooc4
101 --printDebug "matCooc" matCooc
102 -- Matrix -> Clustering
103 let distanceMat = conditional matCooc
104 -- let distanceMat = distributional matCooc
105 printDebug "distanceMat" $ A.arrayShape distanceMat
106 --printDebug "distanceMat" distanceMat
108 let distanceMap = mat2map distanceMat
109 printDebug "distanceMap" $ M.size distanceMap
111 -- let distance = fromIndex fi distanceMap
112 -- printDebug "distance" $ M.size distance
114 partitions <- cLouvain distanceMap
115 ------ | Building : -> Graph -> JSON
116 printDebug "partitions" $ length partitions
117 --printDebug "partitions" partitions
118 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
121 -----------------------------------------------------------
122 -- distance should not be a map since we just "toList" it (same as cLouvain)
123 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
124 -> Map (Int, Int) Double
127 data2graph labels coocs distance partitions = Graph nodes edges
129 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
130 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
131 , n_type = Terms -- or Unknown
133 , n_label = T.unwords l
135 Attributes { clust_default = maybe 0 identity
136 (M.lookup n community_id_by_node_id) } }
138 edges = [ Edge { e_source = s
142 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
143 -----------------------------------------------------------
145 printDebug msg x = putStrLn $ msg <> " " <> show x
146 --printDebug _ _ = pure ()