]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
[Database][Schema] Node Table and types without TSVector.
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.Text.Flow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Data.Maybe (catMaybes)
25 import qualified Data.Set as DS
26
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Database (Connection)
31
32 import Gargantext.Database.Schema.Node
33 import Gargantext.Database.Types.Node
34
35 import Gargantext.Core (Lang)
36 import Gargantext.Prelude
37
38 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
39 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
40 import Gargantext.Viz.Graph (Graph(..), data2graph)
41 import Gargantext.Text.Metrics.Count (cooc)
42 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
43 import Gargantext.Text.Terms (TermType, extractTerms)
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
45 import Gargantext.Core.Types (CorpusId)
46
47 import Gargantext.Text.Parsers.CSV
48
49 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
50
51 {-
52 ____ _ _
53 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
54 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
55 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
56 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
57 |___/
58 -}
59
60
61 contextText :: [T.Text]
62 contextText = map T.pack ["The dog is an animal."
63 ,"The bird is an animal."
64 ,"The dog is an animal."
65 ,"The animal is a bird or a dog ?"
66 ,"The table is an object."
67 ,"The pen is an object."
68 ,"The object is a pen or a table ?"
69 ,"The girl is a human."
70 ,"The boy is a human."
71 ,"The boy or the girl are human."
72 ]
73
74
75 -- | Control the flow of text
76 data TextFlow = CSV FilePath
77 | FullText FilePath
78 | Contexts [T.Text]
79 | DBV3 Connection CorpusId
80 | Query T.Text
81
82
83 textFlow :: TermType Lang -> TextFlow -> IO Graph
84 textFlow termType workType = do
85 contexts <- case workType of
86 FullText path -> splitBy (Sentences 5) <$> readFile path
87 CSV path -> readCsvOn [csv_title, csv_abstract] path
88 Contexts ctxt -> pure ctxt
89 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
90 _ -> undefined -- TODO Query not supported
91
92 textFlow' termType contexts
93
94
95 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
96 textFlow' termType contexts = do
97 -- Context :: Text -> [Text]
98 -- Contexts = Paragraphs n | Sentences n | Chars n
99
100 myterms <- extractTerms termType contexts
101 -- TermsType = Mono | Multi | MonoMulti
102 -- myterms # filter (\t -> not . elem t stopList)
103 -- # groupBy (Stem|GroupList|Ontology)
104 printDebug "terms" myterms
105 printDebug "myterms" (sum $ map length myterms)
106
107 -- Bulding the map list
108 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
109 -- Cooc = Map (Term, Term) Int
110 let myCooc1 = cooc myterms
111 printDebug "myCooc1 size" (M.size myCooc1)
112
113 -- Remove Apax: appears one time only => lighting the matrix
114 let myCooc2 = M.filter (>0) myCooc1
115 printDebug "myCooc2 size" (M.size myCooc2)
116 printDebug "myCooc2" myCooc2
117
118
119 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
120 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
121 (InclusionSize 500 )
122 (SampleBins 10 )
123 (Clusters 3 )
124 (DefaultValue 0 )
125 ) myCooc2
126 printDebug "myCooc3 size" $ M.size myCooc3
127 printDebug "myCooc3" myCooc3
128
129 -- Cooc -> Matrix
130 let (ti, _) = createIndices myCooc3
131 printDebug "ti size" $ M.size ti
132 printDebug "ti" ti
133
134 let myCooc4 = toIndex ti myCooc3
135 printDebug "myCooc4 size" $ M.size myCooc4
136 printDebug "myCooc4" myCooc4
137
138 let matCooc = map2mat (0) (M.size ti) myCooc4
139 printDebug "matCooc shape" $ A.arrayShape matCooc
140 printDebug "matCooc" matCooc
141
142 -- Matrix -> Clustering
143 let distanceMat = measureConditional matCooc
144 --let distanceMat = distributional matCooc
145 printDebug "distanceMat shape" $ A.arrayShape distanceMat
146 printDebug "distanceMat" distanceMat
147 --
148 --let distanceMap = M.filter (>0) $ mat2map distanceMat
149 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
150 printDebug "distanceMap size" $ M.size distanceMap
151 printDebug "distanceMap" distanceMap
152
153 -- let distance = fromIndex fi distanceMap
154 -- printDebug "distance" $ M.size distance
155
156 partitions <- cLouvain distanceMap
157 -- Building : -> Graph -> JSON
158 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
159 --printDebug "partitions" partitions
160 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
161