]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[TestFlow] seems good, need to add tests on it and fix distributional distance.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Data.Maybe (catMaybes)
25 import qualified Data.Set as DS
26
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Database (Connection)
31
32 import Gargantext.Database.Node
33 import Gargantext.Core.Types.Node
34
35 import Gargantext.Core (Lang)
36 import Gargantext.Prelude
37
38 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
39 import Gargantext.Viz.Graph.Distances.Matrice (distributional, measureConditional)
40 import Gargantext.Viz.Graph (Graph(..), data2graph)
41 import Gargantext.Text.Metrics.Count (cooc)
42 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
43 import Gargantext.Text.Terms (TermType, extractTerms)
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
45
46 import Gargantext.Text.Parsers.CSV
47
48 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
49
50 {-
51 ____ _ _
52 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
53 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
54 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
55 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
56 |___/
57 -}
58
59
60 contextText :: [T.Text]
61 contextText = ["The dog is an animal."
62 ,"The bird is an animal."
63 ,"The bird is an animal."
64 ,"The bird and the dog are an animal."
65 ,"The table is an object."
66 ,"The pen is an object."
67 ,"This object is a pen or a table?"
68 ,"The girl has a human body."
69 ,"The girl has a human body."
70 ,"The boy has a human body."
71 ,"The boy has a human body."
72 ]
73
74
75
76 data TextFlow = CSV FilePath
77 | FullText FilePath
78 | Contexts [T.Text]
79 | DB Connection CorpusId
80 | Query T.Text
81 -- ExtDatabase Query
82 -- IntDatabase NodeId
83
84 textFlow :: TermType Lang -> TextFlow -> IO Graph
85 textFlow termType workType = do
86 contexts <- case workType of
87 FullText path -> splitBy (Sentences 5) <$> readFile path
88 CSV path -> readCsvOn [csv_title, csv_abstract] path
89 Contexts ctxt -> pure ctxt
90 SQL con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
91 _ -> undefined
92
93 textFlow' termType contexts
94
95
96 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
97 textFlow' termType contexts = do
98 -- Context :: Text -> [Text]
99 -- Contexts = Paragraphs n | Sentences n | Chars n
100
101 myterms <- extractTerms termType contexts
102 -- TermsType = Mono | Multi | MonoMulti
103 -- myterms # filter (\t -> not . elem t stopList)
104 -- # groupBy (Stem|GroupList|Ontology)
105 printDebug "terms" myterms
106 printDebug "myterms" (sum $ map length myterms)
107
108 -- Bulding the map list
109 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
110 -- Cooc = Map (Term, Term) Int
111 let myCooc1 = cooc myterms
112 printDebug "myCooc1 size" (M.size myCooc1)
113
114 -- Remove Apax: appears one time only => lighting the matrix
115 let myCooc2 = M.filter (>0) myCooc1
116 printDebug "myCooc2 size" (M.size myCooc2)
117 printDebug "myCooc2" myCooc2
118
119
120 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
121 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
122 (InclusionSize 500 )
123 (SampleBins 10 )
124 (Clusters 3 )
125 (DefaultValue 0 )
126 ) myCooc2
127 printDebug "myCooc3 size" $ M.size myCooc3
128 printDebug "myCooc3" myCooc3
129
130 -- Cooc -> Matrix
131 let (ti, _) = createIndices myCooc3
132 printDebug "ti size" $ M.size ti
133 printDebug "ti" ti
134
135 let myCooc4 = toIndex ti myCooc3
136 printDebug "myCooc4 size" $ M.size myCooc4
137 printDebug "myCooc4" myCooc4
138
139 let matCooc = map2mat (0) (M.size ti) myCooc4
140 printDebug "matCooc shape" $ A.arrayShape matCooc
141 printDebug "matCooc" matCooc
142
143 -- Matrix -> Clustering
144 let distanceMat = measureConditional matCooc
145 --let distanceMat = distributional matCooc
146 printDebug "distanceMat shape" $ A.arrayShape distanceMat
147 printDebug "distanceMat" distanceMat
148 --
149 --let distanceMap = M.filter (>0) $ mat2map distanceMat
150 let distanceMap = M.map (\n -> 1) $ M.filter (>0) $ mat2map distanceMat
151 printDebug "distanceMap size" $ M.size distanceMap
152 printDebug "distanceMap" distanceMap
153
154 -- let distance = fromIndex fi distanceMap
155 -- printDebug "distance" $ M.size distance
156
157 partitions <- cLouvain distanceMap
158 -- Building : -> Graph -> JSON
159 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
160 --printDebug "partitions" partitions
161 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
162
163