2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
24 import Data.Maybe (catMaybes)
25 import qualified Data.Set as DS
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Database (Connection)
32 import Gargantext.Database.Node
33 import Gargantext.Core.Types.Node
35 import Gargantext.Core (Lang)
36 import Gargantext.Prelude
38 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
39 import Gargantext.Viz.Graph.Distances.Matrice (distributional, measureConditional)
40 import Gargantext.Viz.Graph (Graph(..), data2graph)
41 import Gargantext.Text.Metrics.Count (cooc)
42 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
43 import Gargantext.Text.Terms (TermType, extractTerms)
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
46 import Gargantext.Text.Parsers.CSV
48 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
52 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
53 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
54 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
55 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
60 contextText :: [T.Text]
61 contextText = ["The dog is an animal."
62 ,"The bird is an animal."
63 ,"The bird is an animal."
64 ,"The bird and the dog are an animal."
65 ,"The table is an object."
66 ,"The pen is an object."
67 ,"This object is a pen or a table?"
68 ,"The girl has a human body."
69 ,"The girl has a human body."
70 ,"The boy has a human body."
71 ,"The boy has a human body."
76 data TextFlow = CSV FilePath
79 | DB Connection CorpusId
84 textFlow :: TermType Lang -> TextFlow -> IO Graph
85 textFlow termType workType = do
86 contexts <- case workType of
87 FullText path -> splitBy (Sentences 5) <$> readFile path
88 CSV path -> readCsvOn [csv_title, csv_abstract] path
89 Contexts ctxt -> pure ctxt
90 SQL con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
93 textFlow' termType contexts
96 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
97 textFlow' termType contexts = do
98 -- Context :: Text -> [Text]
99 -- Contexts = Paragraphs n | Sentences n | Chars n
101 myterms <- extractTerms termType contexts
102 -- TermsType = Mono | Multi | MonoMulti
103 -- myterms # filter (\t -> not . elem t stopList)
104 -- # groupBy (Stem|GroupList|Ontology)
105 printDebug "terms" myterms
106 printDebug "myterms" (sum $ map length myterms)
108 -- Bulding the map list
109 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
110 -- Cooc = Map (Term, Term) Int
111 let myCooc1 = cooc myterms
112 printDebug "myCooc1 size" (M.size myCooc1)
114 -- Remove Apax: appears one time only => lighting the matrix
115 let myCooc2 = M.filter (>0) myCooc1
116 printDebug "myCooc2 size" (M.size myCooc2)
117 printDebug "myCooc2" myCooc2
120 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
121 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
127 printDebug "myCooc3 size" $ M.size myCooc3
128 printDebug "myCooc3" myCooc3
131 let (ti, _) = createIndices myCooc3
132 printDebug "ti size" $ M.size ti
135 let myCooc4 = toIndex ti myCooc3
136 printDebug "myCooc4 size" $ M.size myCooc4
137 printDebug "myCooc4" myCooc4
139 let matCooc = map2mat (0) (M.size ti) myCooc4
140 printDebug "matCooc shape" $ A.arrayShape matCooc
141 printDebug "matCooc" matCooc
143 -- Matrix -> Clustering
144 let distanceMat = measureConditional matCooc
145 --let distanceMat = distributional matCooc
146 printDebug "distanceMat shape" $ A.arrayShape distanceMat
147 printDebug "distanceMat" distanceMat
149 --let distanceMap = M.filter (>0) $ mat2map distanceMat
150 let distanceMap = M.map (\n -> 1) $ M.filter (>0) $ mat2map distanceMat
151 printDebug "distanceMap size" $ M.size distanceMap
152 printDebug "distanceMap" distanceMap
154 -- let distance = fromIndex fi distanceMap
155 -- printDebug "distance" $ M.size distance
157 partitions <- cLouvain distanceMap
158 -- Building : -> Graph -> JSON
159 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
160 --printDebug "partitions" partitions
161 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions