]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[FACTO] moving data2graph from textflow to Viz.Graph.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
import GHC.IO (FilePath)
import System.IO.Error (ioError, userError)

import qualified Data.Text as T
import Data.Text.IO (readFile)

import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
----------------------------------------------
import Gargantext.Core (Lang)
import Gargantext.Prelude

import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional)
import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType, extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Gargantext.Text.Parsers.CSV

import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
42
43
44 {-
45 ____ _ _
46 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
47 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
48 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
49 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
50 |___/
51 -}
52
53
-- | Where the texts given to 'textFlow' come from.
data TextFlow
  = CSV FilePath
    -- ^ Path passed to 'readCsvOn' together with the @csv_title@ and
    --   @csv_abstract@ fields.
  | FullText FilePath
    -- ^ Path of a text file, read whole and then split into contexts.
  | Contexts [T.Text]
    -- ^ Contexts supplied directly by the caller.
  | SQL Int
    -- ^ Not handled by 'textFlow' yet.
  | Database T.Text
    -- ^ Not handled by 'textFlow' yet.
  -- Constructor ideas noted by the original author, not implemented:
  --   ExtDatabase Query
  --   IntDatabase NodeId
61
-- | Build a 'Graph' from one of the text sources described by 'TextFlow'.
--
-- * 'FullText': the file is read as text and cut into contexts with
--   @splitBy (Sentences 5)@.
-- * 'CSV': contexts are read with 'readCsvOn' on the @csv_title@ and
--   @csv_abstract@ fields.
-- * 'Contexts': the given contexts are used unchanged.
--
-- 'SQL' and 'Database' are not implemented: for these the action fails
-- with an 'IOError' (built with 'userError') whose message names the
-- unsupported source, instead of crashing on a bare @undefined@.
--
-- Every constructor is matched explicitly (no wildcard) so that adding a
-- new 'TextFlow' constructor is reported by the incomplete-patterns
-- warning rather than silently falling into a failing branch.
textFlow :: TermType Lang -> TextFlow -> IO Graph
textFlow termType workType = do
  contexts <- case workType of
    FullText path -> splitBy (Sentences 5) <$> readFile path
    CSV      path -> readCsvOn [csv_title, csv_abstract] path
    Contexts ctxt -> pure ctxt
    SQL      _    -> notImplemented "Gargantext.TextFlow.textFlow: SQL source is not implemented yet"
    Database _    -> notImplemented "Gargantext.TextFlow.textFlow: Database source is not implemented yet"

  textFlow' termType contexts
  where
    -- Fail inside IO with a readable message; raised only when the
    -- action is actually run, like the original @undefined@ branch.
    notImplemented msg = ioError (userError msg)
71
72
-- | Build a 'Graph' from a list of text contexts.
--
-- Steps, in order: term extraction, cooccurrence counting, filtering,
-- indexing, conversion to a matrix, distance computation, clustering with
-- 'cLouvain', and assembly with 'data2graph'. The size of each
-- intermediate result is reported through 'printDebug'.
textFlow' :: TermType Lang -> [T.Text] -> IO Graph
textFlow' termType contexts = do
  -- Original author's notes:
  --   Context  :: Text -> [Text]
  --   Contexts  = Paragraphs n | Sentences n | Chars n
  --   TermsType = Mono | Multi | MonoMulti
  -- Still to do according to those notes: stop-list filtering and
  -- grouping (Stem | GroupList | Ontology) of the extracted terms.
  extracted <- extractTerms termType contexts
  printDebug "myterms" (sum (map length extracted))

  -- Building the map list: copresences of terms, i.e. cooccurrences of
  -- terms in the same context (Cooc = Map (Term, Term) Int).
  let rawCooc = cooc extracted
  printDebug "myCooc1" (M.size rawCooc)

  -- Drop hapaxes (entries whose count is not greater than 1) to lighten
  -- the matrix.
  let frequentCooc = M.filter (> 1) rawCooc
  printDebug "myCooc2" (M.size frequentCooc)

  -- Filter terms with inclusion/exclusion and specificity/genericity
  -- scores.
  let selection =
        FilterConfig
          (MapListSize 100)
          (InclusionSize 400)
          (SampleBins 10)
          (Clusters 3)
          (DefaultValue 0)
      selectedCooc = filterCooc selection frequentCooc
  printDebug "myCooc3" $ M.size selectedCooc

  -- Cooc -> Matrix. Only the first component of 'createIndices' is
  -- needed here.
  let (termIndex, _) = createIndices selectedCooc
  printDebug "ti" $ M.size termIndex

  let indexedCooc = toIndex termIndex selectedCooc
  printDebug "myCooc4" $ M.size indexedCooc

  -- Matrix -> distances. The original kept a commented-out
  -- @distributional@ alternative to 'conditional' at this point.
  let coocMatrix = map2mat 0 (M.size termIndex) indexedCooc
      distances  = conditional coocMatrix
  printDebug "distanceMat" $ A.arrayShape distances

  let distanceMap = mat2map distances
  printDebug "distanceMap" $ M.size distanceMap

  -- Distances -> clustering.
  communities <- cLouvain distanceMap
  printDebug "partitions" $ length communities

  -- Clustering -> Graph (later serialised to JSON, per the original note).
  pure $ data2graph (M.toList termIndex) indexedCooc distanceMap communities
130
131