2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
24 import Data.Maybe (catMaybes)
25 import qualified Data.Set as DS
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Database (Connection)
32 import Gargantext.Database.Node
33 import Gargantext.Core.Types.Node
35 import Gargantext.Core (Lang)
36 import Gargantext.Prelude
38 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
39 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
40 import Gargantext.Viz.Graph (Graph(..), data2graph)
41 import Gargantext.Text.Metrics.Count (cooc)
42 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
43 import Gargantext.Text.Terms (TermType, extractTerms)
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
46 import Gargantext.Text.Parsers.CSV
48 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
52 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
53 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
54 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
55 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
60 contextText :: [T.Text]
61 contextText = map T.pack ["The dog is an animal."
62 ,"The bird is an animal."
63 ,"The bird is an animal."
64 ,"The bird and the dog are an animal."
65 ,"The table is an object."
66 ,"The pen is an object."
67 ,"This object is a pen or a table?"
68 ,"The girl has a human body."
69 ,"The girl has a human body."
70 ,"The boy has a human body."
71 ,"The boy has a human body."
75 -- | Control the flow of text
76 data TextFlow = CSV FilePath
79 | DB Connection CorpusId
83 textFlow :: TermType Lang -> TextFlow -> IO Graph
84 textFlow termType workType = do
85 contexts <- case workType of
86 FullText path -> splitBy (Sentences 5) <$> readFile path
87 CSV path -> readCsvOn [csv_title, csv_abstract] path
88 Contexts ctxt -> pure ctxt
89 DB con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
92 textFlow' termType contexts
95 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
96 textFlow' termType contexts = do
97 -- Context :: Text -> [Text]
98 -- Contexts = Paragraphs n | Sentences n | Chars n
100 myterms <- extractTerms termType contexts
101 -- TermsType = Mono | Multi | MonoMulti
102 -- myterms # filter (\t -> not . elem t stopList)
103 -- # groupBy (Stem|GroupList|Ontology)
104 printDebug "terms" myterms
105 printDebug "myterms" (sum $ map length myterms)
107 -- Bulding the map list
108 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
109 -- Cooc = Map (Term, Term) Int
110 let myCooc1 = cooc myterms
111 printDebug "myCooc1 size" (M.size myCooc1)
113 -- Remove Apax: appears one time only => lighting the matrix
114 let myCooc2 = M.filter (>0) myCooc1
115 printDebug "myCooc2 size" (M.size myCooc2)
116 printDebug "myCooc2" myCooc2
119 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
120 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
126 printDebug "myCooc3 size" $ M.size myCooc3
127 printDebug "myCooc3" myCooc3
130 let (ti, _) = createIndices myCooc3
131 printDebug "ti size" $ M.size ti
134 let myCooc4 = toIndex ti myCooc3
135 printDebug "myCooc4 size" $ M.size myCooc4
136 printDebug "myCooc4" myCooc4
138 let matCooc = map2mat (0) (M.size ti) myCooc4
139 printDebug "matCooc shape" $ A.arrayShape matCooc
140 printDebug "matCooc" matCooc
142 -- Matrix -> Clustering
143 let distanceMat = measureConditional matCooc
144 --let distanceMat = distributional matCooc
145 printDebug "distanceMat shape" $ A.arrayShape distanceMat
146 printDebug "distanceMat" distanceMat
148 --let distanceMap = M.filter (>0) $ mat2map distanceMat
149 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
150 printDebug "distanceMap size" $ M.size distanceMap
151 printDebug "distanceMap" distanceMap
153 -- let distance = fromIndex fi distanceMap
154 -- printDebug "distance" $ M.size distance
156 partitions <- cLouvain distanceMap
157 -- Building : -> Graph -> JSON
158 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
159 --printDebug "partitions" partitions
160 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions