2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
24 import Data.Maybe (catMaybes)
25 import qualified Data.Set as DS
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Database (Connection)
32 import Gargantext.Database.Node
33 import Gargantext.Database.Types.Node
35 import Gargantext.Core (Lang)
36 import Gargantext.Prelude
38 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
39 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
40 import Gargantext.Viz.Graph (Graph(..), data2graph)
41 import Gargantext.Text.Metrics.Count (cooc)
42 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
43 import Gargantext.Text.Terms (TermType, extractTerms)
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
46 import Gargantext.Text.Parsers.CSV
48 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
52 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
53 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
54 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
55 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
60 contextText :: [T.Text]
61 contextText = map T.pack ["The dog is an animal."
62 ,"The bird is a animal."
63 ,"The dog is a animal."
64 ,"The animal is a bird or a dog ?"
65 ,"The table is a object."
66 ,"The pen is a object."
67 ,"The object is a pen or a table ?"
68 ,"The girl is human body."
69 ,"The boy is human body."
70 ,"The boy or the girl are human body."
74 -- | Control the flow of text
75 data TextFlow = CSV FilePath
78 | DB Connection CorpusId
82 textFlow :: TermType Lang -> TextFlow -> IO Graph
83 textFlow termType workType = do
84 contexts <- case workType of
85 FullText path -> splitBy (Sentences 5) <$> readFile path
86 CSV path -> readCsvOn [csv_title, csv_abstract] path
87 Contexts ctxt -> pure ctxt
88 DB con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
91 textFlow' termType contexts
94 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
95 textFlow' termType contexts = do
96 -- Context :: Text -> [Text]
97 -- Contexts = Paragraphs n | Sentences n | Chars n
99 myterms <- extractTerms termType contexts
100 -- TermsType = Mono | Multi | MonoMulti
101 -- myterms # filter (\t -> not . elem t stopList)
102 -- # groupBy (Stem|GroupList|Ontology)
103 printDebug "terms" myterms
104 printDebug "myterms" (sum $ map length myterms)
106 -- Bulding the map list
107 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
108 -- Cooc = Map (Term, Term) Int
109 let myCooc1 = cooc myterms
110 printDebug "myCooc1 size" (M.size myCooc1)
112 -- Remove Apax: appears one time only => lighting the matrix
113 let myCooc2 = M.filter (>0) myCooc1
114 printDebug "myCooc2 size" (M.size myCooc2)
115 printDebug "myCooc2" myCooc2
118 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
119 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
125 printDebug "myCooc3 size" $ M.size myCooc3
126 printDebug "myCooc3" myCooc3
129 let (ti, _) = createIndices myCooc3
130 printDebug "ti size" $ M.size ti
133 let myCooc4 = toIndex ti myCooc3
134 printDebug "myCooc4 size" $ M.size myCooc4
135 printDebug "myCooc4" myCooc4
137 let matCooc = map2mat (0) (M.size ti) myCooc4
138 printDebug "matCooc shape" $ A.arrayShape matCooc
139 printDebug "matCooc" matCooc
141 -- Matrix -> Clustering
142 let distanceMat = measureConditional matCooc
143 --let distanceMat = distributional matCooc
144 printDebug "distanceMat shape" $ A.arrayShape distanceMat
145 printDebug "distanceMat" distanceMat
147 --let distanceMap = M.filter (>0) $ mat2map distanceMat
148 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
149 printDebug "distanceMap size" $ M.size distanceMap
150 printDebug "distanceMap" distanceMap
152 -- let distance = fromIndex fi distanceMap
153 -- printDebug "distance" $ M.size distance
155 partitions <- cLouvain distanceMap
156 -- Building : -> Graph -> JSON
157 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
158 --printDebug "partitions" partitions
159 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions