]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[DOC] distances.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
import GHC.IO (FilePath)
import qualified Data.Text as T
import Data.Text.IO (readFile)
import qualified System.IO.Error as IOE


import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
----------------------------------------------
import Gargantext.Core (Lang)
import Gargantext.Prelude

import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
import Gargantext.Text.Terms (TermType, extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Gargantext.Text.Parsers.CSV

import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
42
43 {-
44 ____ _ _
45 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
46 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
47 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
48 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
49 |___/
50 -}
51
52
-- | The kinds of text source a flow can start from.
--
-- Only 'CSV', 'FullText' and 'Contexts' are turned into contexts by
-- 'textFlow'; 'SQL' and 'Database' are declared but not handled there.
data TextFlow
  = CSV FilePath      -- ^ Path of a CSV file (read with 'readCsvOn' in 'textFlow').
  | FullText FilePath -- ^ Path of a plain text file (read and split into sentence groups in 'textFlow').
  | Contexts [T.Text] -- ^ Contexts that are already available as a list of texts.
  | SQL Int           -- ^ NOTE(review): meaning of the 'Int' is not visible here - confirm.
  | Database T.Text   -- ^ NOTE(review): meaning of the text is not visible here - confirm.
  -- Possible future constructors, kept from the original notes:
  --   ExtDatabase Query
  --   IntDatabase NodeId
60
-- | Build a 'Graph' from a text source.
--
-- The source is first turned into a list of text contexts, which is then
-- handed to @textFlow'@ together with the term type:
--
--   * 'FullText': the file is read and split with @splitBy (Sentences 5)@.
--   * 'CSV': the file is read with 'readCsvOn' over the title and abstract
--     fields.
--   * 'Contexts': the given contexts are used as they are.
--   * 'SQL' and 'Database': not supported yet. An 'IOError' naming the
--     rejected source is raised (instead of evaluating @undefined@), so the
--     failure is explicit and can be caught by callers.
--
-- Every constructor is matched explicitly so that the compiler can report a
-- missing case if 'TextFlow' gains a new constructor.
textFlow :: TermType Lang -> TextFlow -> IO Graph
textFlow termType workType = do
  contexts <- case workType of
    FullText path -> splitBy (Sentences 5) <$> readFile path
    CSV path      -> readCsvOn [csv_title, csv_abstract] path
    Contexts ctxt -> pure ctxt
    SQL _         -> IOE.ioError (IOE.userError "Gargantext.TextFlow.textFlow: SQL source is not supported yet")
    Database _    -> IOE.ioError (IOE.userError "Gargantext.TextFlow.textFlow: Database source is not supported yet")

  textFlow' termType contexts
70
71
-- | Build a 'Graph' from a list of text contexts.
--
-- The steps run in this order: term extraction ('extractTerms'),
-- cooccurrence counting ('cooc'), removal of cooccurrences counted only
-- once, filtering ('filterCooc'), indexing ('createIndices' / 'toIndex'),
-- conversion to a matrix ('map2mat'), distance computation
-- ('distributional'), clustering ('cLouvain') and graph construction
-- ('data2graph'). Intermediate results are reported with 'printDebug'.
textFlow' :: TermType Lang -> [T.Text] -> IO Graph
textFlow' termType contexts = do
  -- Context  :: Text -> [Text]
  -- Contexts = Paragraphs n | Sentences n | Chars n

  -- TermsType = Mono | Multi | MonoMulti
  -- Possible later steps on the extracted terms:
  --   # filter (\t -> not . elem t stopList)
  --   # groupBy (Stem|GroupList|Ontology)
  extractedTerms <- extractTerms termType contexts
  printDebug "myterms" $ sum (map length extractedTerms)

  -- Building the map list:
  -- compute the copresences of terms, i.e. the cooccurrences of terms in the
  -- same context of text.
  -- Cooc = Map (Term, Term) Int
  let rawCooc = cooc extractedTerms
  printDebug "myCooc1" $ M.size rawCooc

  -- Drop hapaxes (entries counted only once) to lighten the matrix.
  let frequentCooc = M.filter (> 1) rawCooc
  printDebug "myCooc2" $ M.size frequentCooc

  -- Filter terms with Inclusion/Exclusion and Specificity/Genericity scores.
  let filterConfig = FilterConfig
        (MapListSize   100)
        (InclusionSize 900)
        (SampleBins    10)
        (Clusters      3)
        (DefaultValue  0)
      selectedCooc = filterCooc filterConfig frequentCooc
  printDebug "myCooc3" (M.size selectedCooc)
  -- putStrLn $ show selectedCooc

  -- Cooc -> Matrix
  let (termIndex, _) = createIndices selectedCooc
  printDebug "ti" (M.size termIndex)

  let indexedCooc = toIndex termIndex selectedCooc
  printDebug "myCooc4" (M.size indexedCooc)

  let coocMatrix = map2mat 0 (M.size termIndex) indexedCooc
  printDebug "matCooc" coocMatrix

  -- Matrix -> Clustering
  -- (alternative distance: conditional' coocMatrix)
  let distanceMatrix = distributional coocMatrix
  printDebug "distanceMat" (A.arrayShape distanceMatrix)
  printDebug "distanceMat" distanceMatrix

  let distanceMap = mat2map distanceMatrix
  printDebug "distanceMap" (M.size distanceMap)

  -- let distance = fromIndex fi distanceMap
  -- printDebug "distance" $ M.size distance

  communities <- cLouvain distanceMap
  printDebug "partitions" (length communities)
  -- printDebug "partitions" communities

  -- Building: -> Graph -> JSON
  pure (data2graph (M.toList termIndex) indexedCooc distanceMap communities)
130
131