]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[DOC+TESTS] contexts of texts.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24
25 import Data.Map.Strict (Map)
26 import qualified Data.Array.Accelerate as A
27 import qualified Data.Map.Strict as M
28 ----------------------------------------------
29 import Gargantext.Core (Lang)
30 import Gargantext.Core.Types (Label)
31 import Gargantext.Prelude
32
33 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
34 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType, extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
40
41 import Gargantext.Text.Parsers.CSV
42
43 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
44
45
46 {-
47 ____ _ _
48 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
49 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
50 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
51 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
52 |___/
53 -}
54
55
56 data TextFlow = CSV FilePath
57 | FullText FilePath
58 | Contexts [T.Text]
59 | SQL Int
60 | Database T.Text
61 -- ExtDatabase Query
62 -- IntDatabase NodeId
63
64 textFlow :: TermType Lang -> TextFlow -> IO Graph
65 textFlow termType workType = do
66 contexts <- case workType of
67 FullText path -> splitBy (Sentences 5) <$> readFile path
68 CSV path -> readCsvOn [csv_title, csv_abstract] path
69 Contexts ctxt -> pure ctxt
70 _ -> undefined
71
72 textFlow' termType contexts
73
74
75 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
76 textFlow' termType contexts = do
77 -- Context :: Text -> [Text]
78 -- Contexts = Paragraphs n | Sentences n | Chars n
79
80 myterms <- extractTerms termType contexts
81 -- TermsType = Mono | Multi | MonoMulti
82 -- myterms # filter (\t -> not . elem t stopList)
83 -- # groupBy (Stem|GroupList|Ontology)
84 printDebug "myterms" (sum $ map length myterms)
85
86 -- Bulding the map list
87 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
88 -- Cooc = Map (Term, Term) Int
89 let myCooc1 = cooc myterms
90 printDebug "myCooc1" (M.size myCooc1)
91
92 -- Remove Apax: appears one time only => lighting the matrix
93 let myCooc2 = M.filter (>1) myCooc1
94 printDebug "myCooc2" (M.size myCooc2)
95
96 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
97 let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
98 (InclusionSize 400 )
99 (SampleBins 10 )
100 (Clusters 3 )
101 (DefaultValue 0 )
102 ) myCooc2
103 printDebug "myCooc3" $ M.size myCooc3
104 -- putStrLn $ show myCooc3
105
106 -- Cooc -> Matrix
107 let (ti, _) = createIndices myCooc3
108 printDebug "ti" $ M.size ti
109
110 let myCooc4 = toIndex ti myCooc3
111 printDebug "myCooc4" $ M.size myCooc4
112
113 let matCooc = map2mat (0) (M.size ti) myCooc4
114 -- printDebug "matCooc" matCooc
115 -- Matrix -> Clustering
116 let distanceMat = conditional matCooc
117 -- let distanceMat = distributional matCooc
118 printDebug "distanceMat" $ A.arrayShape distanceMat
119 -- printDebug "distanceMat" distanceMat
120 --
121 let distanceMap = mat2map distanceMat
122 printDebug "distanceMap" $ M.size distanceMap
123 --{-
124 -- let distance = fromIndex fi distanceMap
125 -- printDebug "distance" $ M.size distance
126 ---}
127 partitions <- cLouvain distanceMap
128 -- Building : -> Graph -> JSON
129 printDebug "partitions" $ length partitions
130 --printDebug "partitions" partitions
131 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
132
133 -----------------------------------------------------------
134 -- | From data to Graph
135 -- FIXME: distance should not be a map since we just "toList" it (same as cLouvain)
136 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
137 -> Map (Int, Int) Double
138 -> [LouvainNode]
139 -> Graph
140 data2graph labels coocs distance partitions = Graph nodes edges
141 where
142 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
143 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
144 , node_type = Terms -- or Unknown
145 , node_id = cs (show n)
146 , node_label = T.unwords l
147 , node_attributes =
148 Attributes { clust_default = maybe 0 identity
149 (M.lookup n community_id_by_node_id) } }
150 | (l, n) <- labels ]
151 edges = [ Edge { edge_source = cs (show s)
152 , edge_target = cs (show t)
153 , edge_weight = w
154 , edge_id = cs (show i) }
155 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
156 -----------------------------------------------------------
157