]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[FACTO] textflow + cosmetics.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Control.Monad.IO.Class (MonadIO)
25
26 import Data.Map.Strict (Map)
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Core (Lang)
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
33
34 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
35 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
36 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
37 import Gargantext.Text.Metrics.Count (cooc)
38 import Gargantext.Text.Metrics
39 import Gargantext.Text.Terms (TermType, extractTerms)
40 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41
42 import Gargantext.Text.Parsers.CSV
43
44 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
45
46
47 {-
48 ____ _ _
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
53 |___/
54 -}
55
56 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
57 printDebug msg x = putStrLn $ msg <> " " <> show x
58 --printDebug _ _ = pure ()
59
60 data TextFlow = CSV FilePath
61 | FullText FilePath
62 | Contexts [T.Text]
63 | SQL Int
64 | Database T.Text
65 -- | ExtDatabase Query
66 -- | IntDatabase NodeId
67
68 textFlow :: TermType Lang -> TextFlow -> IO Graph
69 textFlow termType workType = do
70 contexts <- case workType of
71 FullText path -> splitBy (Sentences 5) <$> readFile path
72 CSV path -> readCsvOn [csv_title, csv_abstract] path
73 Contexts ctxt -> pure ctxt
74 _ -> undefined
75
76 textFlow' termType contexts
77
78
79 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
80 textFlow' termType contexts = do
81 -- Context :: Text -> [Text]
82 -- Contexts = Paragraphs n | Sentences n | Chars n
83
84 myterms <- extractTerms termType contexts
85 -- TermsType = Mono | Multi | MonoMulti
86 -- myterms # filter (\t -> not . elem t stopList)
87 -- # groupBy (Stem|GroupList|Ontology)
88 printDebug "myterms" (sum $ map length myterms)
89
90 -- Bulding the map list
91 -- compute copresences of terms
92 -- Cooc = Map (Term, Term) Int
93 let myCooc1 = cooc myterms
94 printDebug "myCooc1" (M.size myCooc1)
95
96 -- Remove Apax: appears one time only => lighting the matrix
97 let myCooc2 = M.filter (>1) myCooc1
98 printDebug "myCooc2" (M.size myCooc2)
99
100 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
101 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
102 (InclusionSize 4000 )
103 (SampleBins 10 )
104 (Clusters 3 )
105 (DefaultValue 0 )
106 ) myCooc2
107 printDebug "myCooc3" $ M.size myCooc3
108
109 -- Cooc -> Matrix
110 let (ti, _) = createIndices myCooc3
111 printDebug "ti" $ M.size ti
112
113 let myCooc4 = toIndex ti myCooc3
114 printDebug "myCooc4" $ M.size myCooc4
115
116 let matCooc = map2mat (0) (M.size ti) myCooc4
117 --printDebug "matCooc" matCooc
118 -- Matrix -> Clustering
119 let distanceMat = conditional matCooc
120 -- let distanceMat = distributional matCooc
121 printDebug "distanceMat" $ A.arrayShape distanceMat
122 --printDebug "distanceMat" distanceMat
123 --
124 let distanceMap = mat2map distanceMat
125 printDebug "distanceMap" $ M.size distanceMap
126 --{-
127 -- let distance = fromIndex fi distanceMap
128 -- printDebug "distance" $ M.size distance
129 ---}
130 partitions <- cLouvain distanceMap
131 ------ | Building : -> Graph -> JSON
132 printDebug "partitions" $ length partitions
133 --printDebug "partitions" partitions
134 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
135
136 -----------------------------------------------------------
137 -- distance should not be a map since we just "toList" it (same as cLouvain)
138 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
139 -> Map (Int, Int) Double
140 -> [LouvainNode]
141 -> Graph
142 data2graph labels coocs distance partitions = Graph nodes edges
143 where
144 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
145 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
146 , node_type = Terms -- or Unknown
147 , node_id = cs (show n)
148 , node_label = T.unwords l
149 , node_attributes =
150 Attributes { clust_default = maybe 0 identity
151 (M.lookup n community_id_by_node_id) } }
152 | (l, n) <- labels ]
153 edges = [ Edge { edge_source = cs (show s)
154 , edge_target = cs (show t)
155 , edge_weight = w
156 , edge_id = cs (show i) }
157 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
158 -----------------------------------------------------------
159