]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-adaptative-phylo/Main.hs
[ngrams] add score update endpoint + sorting
[gargantext.git] / bin / gargantext-adaptative-phylo / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter binary with Adaptative Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Adaptative Phylo binaries
11 -}
12
13 {-# LANGUAGE StandaloneDeriving #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# LANGUAGE Strict #-}
16
17 module Main where
18
19 import Data.Aeson
20 import Data.ByteString.Lazy (ByteString)
21 import Data.Maybe (isJust, fromJust)
22 import Data.List (concat, nub, isSuffixOf, take)
23 import Data.String (String)
24 import Data.Text (Text, unwords, unpack)
25
26 import Gargantext.Prelude
27 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
28 import Gargantext.Core.Text.Context (TermList)
29 import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year)
30 import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
31 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
32 import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
33 import Gargantext.Core.Viz.AdaptativePhylo
34 import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo)
35 import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
36 import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
37 -- import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicDistance')
38
39 import GHC.IO (FilePath)
40 import Prelude (Either(..))
41 import System.Environment
42 import System.Directory (listDirectory)
43 import Control.Concurrent.Async (mapConcurrently)
44
45 import qualified Data.ByteString.Lazy as Lazy
46 import qualified Data.Vector as Vector
47 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
48
49
50 ---------------
51 -- | Tools | --
52 ---------------
53
54
55 -- | To get all the files in a directory or just a file
56 getFilesFromPath :: FilePath -> IO([FilePath])
57 getFilesFromPath path = do
58 if (isSuffixOf "/" path)
59 then (listDirectory path)
60 else return [path]
61
62
63 --------------
64 -- | Json | --
65 --------------
66
67
68 -- | To read and decode a Json file
69 readJson :: FilePath -> IO ByteString
70 readJson path = Lazy.readFile path
71
72
73 ----------------
74 -- | Parser | --
75 ----------------
76
77 -- | To filter the Ngrams of a document based on the termList
78 filterTerms :: Patterns -> (a, Text) -> (a, [Text])
79 filterTerms patterns (y,d) = (y,termsInText patterns d)
80 where
81 --------------------------------------
82 termsInText :: Patterns -> Text -> [Text]
83 termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList pats txt
84 --------------------------------------
85
86
87 -- | To transform a Wos file (or [file]) into a readable corpus
88 wosToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
89 wosToCorpus limit path = do
90 files <- getFilesFromPath path
91 take limit
92 <$> map (\d -> let date' = fromJust $ _hd_publication_year d
93 title = fromJust $ _hd_title d
94 abstr = if (isJust $ _hd_abstract d)
95 then fromJust $ _hd_abstract d
96 else ""
97 in (date', title <> " " <> abstr))
98 <$> concat
99 <$> mapConcurrently (\file ->
100 filter (\d -> (isJust $ _hd_publication_year d)
101 && (isJust $ _hd_title d))
102 <$> parseFile WOS (path <> file) ) files
103
104
105 -- | To transform a Csv file into a readable corpus
106 csvToCorpus :: Int -> FilePath -> IO ([(Int,Text)])
107 csvToCorpus limit path = Vector.toList
108 <$> Vector.take limit
109 <$> Vector.map (\row -> (csv_publication_year row, (csv_title row) <> " " <> (csv_abstract row)))
110 <$> snd <$> Csv.readFile path
111
112
113 -- | To use the correct parser given a CorpusType
114 fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
115 fileToCorpus parser path = case parser of
116 Wos limit -> wosToCorpus limit path
117 Csv limit -> csvToCorpus limit path
118
119
120 -- | To parse a file into a list of Document
121 fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
122 fileToDocs parser path lst = do
123 corpus <- fileToCorpus parser path
124 let patterns = buildPatterns lst
125 pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
126
127
128 --------------
129 -- | Main | --
130 --------------
131
132
133 main :: IO ()
134 main = do
135
136 printIOMsg "Starting the reconstruction"
137
138 printIOMsg "Read the configuration file"
139 [args] <- getArgs
140 jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either String Config)
141
142 case jsonArgs of
143 Left err -> putStrLn err
144 Right config -> do
145
146 printIOMsg "Parse the corpus"
147 mapList <- csvMapTermList (listPath config)
148 corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
149 printIOComment (show (length corpus) <> " parsed docs from the corpus")
150
151 printIOMsg "Reconstruct the Phylo"
152
153 let phylo = toPhylo corpus mapList config
154
155 -- | probes
156
157 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
158 -- $ synchronicDistance' phylo 1
159
160 -- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
161 -- $ inflexionPoints phylo 1
162
163 printIOMsg "End of reconstruction, start the export"
164
165 let dot = toPhyloExport phylo
166
167 let clq = case (clique config) of
168 Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
169 MaxClique s -> "clique_" <> (show s)
170
171 let sensibility = case (phyloProximity config) of
172 Hamming -> undefined
173 WeightedLogJaccard s -> (show s)
174
175 let sync = case (phyloSynchrony config) of
176 ByProximityThreshold t _ _ _ -> (show t)
177 ByProximityDistribution _ _ -> undefined
178
179 -- to be improved
180 -- let br_length = case (take 1 $ exportFilter config) of
181 -- ByBranchSize t -> (show t)
182
183
184 let output = (outputPath config)
185 <> (unpack $ phyloName config)
186 <> "-" <> clq
187 <> "-level_" <> (show (phyloLevel config))
188 <> "-sens_" <> sensibility
189 -- <> "-lenght_" <> br_length
190 <> "-scale_" <> (show (_qua_granularity $ phyloQuality config))
191 <> "-sync_" <> sync
192 <> ".dot"
193
194 dotToFile output dot