]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/CleanCsvCorpus.hs
Merge branch 'dev-phylo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantex...
[gargantext.git] / bin / gargantext-cli / CleanCsvCorpus.hs
1 {-|
2 Module : CleanCsvCorpus.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Given a Gargantext CSV file and its query, this script cleans and
11 compresses the contexts around the main terms of the query.
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module CleanCsvCorpus where
17
18 --import GHC.IO (FilePath)
19 import Data.SearchEngine as S
20 import qualified Data.Set as S
21 import Data.Text (pack)
22 import Data.Vector (Vector)
23 import qualified Data.Vector as V
24
25 import Gargantext.Prelude
26 import Gargantext.Text.Search
27 import qualified Gargantext.Text.Corpus.Parsers.CSV as CSV
28 ------------------------------------------------------------------------
29
-- | A search query, represented as a list of search-engine terms
-- ('S.Term').
type Query = [S.Term]
31
-- | Keep only the documents whose 'CSV.d_docId' occurs in the given list
-- of document ids.
--
-- The id list is turned into a set once, in the @where@ clause, so the
-- set is guaranteed to be shared across all documents instead of relying
-- on the optimiser to float it out of the per-document predicate.
filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter isWanted
  where
    -- Built once per call to 'filterDocs', not once per document.
    wantedIds = S.fromList docIds

    isWanted doc = S.member (CSV.d_docId doc) wantedIds
34
35
-- | Read a Gargantext CSV corpus, keep only the documents that the search
-- engine returns for a fixed list of query words, and write the result to
-- a second CSV file.
--
-- The number of documents and the value of 'CSV.docsSize' are printed
-- before and after filtering.
main :: IO ()
main = do
  let inputPath  = "/tmp/Gargantext_Corpus.csv"
      outputPath = "/tmp/Gargantext_Corpus_bis.csv"
      -- Alternative query kept for reference:
      -- ["water", "scarcity", "morocco", "shortage","flood"]
      queryWords = ["gratuit", "gratuité", "culture", "culturel"]

  (csvHeader, rawDocs) <- CSV.readFile inputPath

  putStrLn ("Number of documents before:" <> show (V.length rawDocs))
  putStrLn ("Mean size of docs:" <> show (CSV.docsSize rawDocs))

  -- Index every document, run the query, then keep only the matching ones.
  let parsedDocs   = CSV.toDocs rawDocs
      searchEngine = insertDocs parsedDocs initialDocSearchEngine
      matchingIds  = S.query searchEngine (map pack queryWords)
      keptDocs     = CSV.fromDocs (filterDocs matchingIds (V.fromList parsedDocs))

  putStrLn ("Number of documents after:" <> show (V.length keptDocs))
  putStrLn ("Mean size of docs:" <> show (CSV.docsSize keptDocs))

  -- The original header is reused unchanged for the filtered corpus.
  CSV.writeFile outputPath (csvHeader, keptDocs)