]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[SPECS] index a corpus with term list.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Control.Monad.IO.Class (MonadIO)
25
26 import Data.Map.Strict (Map)
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Core (Lang(FR))
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
33
34 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
35 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
36 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
37 import Gargantext.Text.Metrics.Count (cooc)
38 import Gargantext.Text.Metrics
39 import Gargantext.Text.Terms (TermType(Mono), extractTerms)
40 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41
42 import Gargantext.Text.Parsers.CSV
43
44 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
45
46
47 {-
48 ____ _ _
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
53 |___/
54
55 -}
56
57 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
58 printDebug msg x = putStrLn $ msg <> " " <> show x
59 --printDebug _ _ = pure ()
60
61 data TextFlow = CSV | FullText
62
63 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
64 textflow :: Lang -> TextFlow -> FilePath -> IO Graph
65 textflow _ workType path = do
66 -- Text <- IO Text <- FilePath
67 contexts <- case workType of
68 FullText -> splitBy (Sentences 5) <$> readFile path
69 CSV -> readCsvOn [csv_title, csv_abstract] path
70
71 -- Context :: Text -> [Text]
72 -- Contexts = Paragraphs n | Sentences n | Chars n
73
74 myterms <- extractTerms (Mono FR) contexts
75 -- TermsType = Mono | Multi | MonoMulti
76 -- myterms # filter (\t -> not . elem t stopList)
77 -- # groupBy (Stem|GroupList|Ontology)
78 printDebug "myterms" (sum $ map length myterms)
79
80 -- Bulding the map list
81 -- compute copresences of terms
82 -- Cooc = Map (Term, Term) Int
83 let myCooc1 = cooc myterms
84 printDebug "myCooc1" (M.size myCooc1)
85
86 -- Remove Apax: appears one time only => lighting the matrix
87 let myCooc2 = M.filter (>1) myCooc1
88 printDebug "myCooc2" (M.size myCooc2)
89
90 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
91 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
92 (InclusionSize 4000 )
93 (SampleBins 10 )
94 (Clusters 3 )
95 (DefaultValue 0 )
96 ) myCooc2
97 printDebug "myCooc3" $ M.size myCooc3
98
99 -- Cooc -> Matrix
100 let (ti, _) = createIndices myCooc3
101 printDebug "ti" $ M.size ti
102
103 let myCooc4 = toIndex ti myCooc3
104 printDebug "myCooc4" $ M.size myCooc4
105
106 let matCooc = map2mat (0) (M.size ti) myCooc4
107 --printDebug "matCooc" matCooc
108 -- Matrix -> Clustering
109 let distanceMat = conditional matCooc
110 -- let distanceMat = distributional matCooc
111 printDebug "distanceMat" $ A.arrayShape distanceMat
112 --printDebug "distanceMat" distanceMat
113 --
114 let distanceMap = mat2map distanceMat
115 printDebug "distanceMap" $ M.size distanceMap
116 --{-
117 -- let distance = fromIndex fi distanceMap
118 -- printDebug "distance" $ M.size distance
119 ---}
120 partitions <- cLouvain distanceMap
121 ------ | Building : -> Graph -> JSON
122 printDebug "partitions" $ length partitions
123 --printDebug "partitions" partitions
124 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
125
126
127 -----------------------------------------------------------
128 -- distance should not be a map since we just "toList" it (same as cLouvain)
129 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
130 -> Map (Int, Int) Double
131 -> [LouvainNode]
132 -> Graph
133 data2graph labels coocs distance partitions = Graph nodes edges
134 where
135 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
136 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
137 , node_type = Terms -- or Unknown
138 , node_id = cs (show n)
139 , node_label = T.unwords l
140 , node_attributes =
141 Attributes { clust_default = maybe 0 identity
142 (M.lookup n community_id_by_node_id) } }
143 | (l, n) <- labels ]
144 edges = [ Edge { edge_source = cs (show s)
145 , edge_target = cs (show t)
146 , edge_weight = w
147 , edge_id = cs (show i) }
148 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
149 -----------------------------------------------------------
150