]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/CleanCsvCorpus.hs
[FIX] repo file snapshots.
[gargantext.git] / bin / gargantext-cli / CleanCsvCorpus.hs
1 {-|
2 Module : CleanCsvCorpus.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Given a Gargantext CSV file and its query, this script cleans and
11 compresses the contexts around the main terms of the query.
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module CleanCsvCorpus where
17
18 --import GHC.IO (FilePath)
19 import Data.SearchEngine as S
20 import qualified Data.Set as S
21 import Data.Text (pack)
22 import Data.Vector (Vector)
23 import qualified Data.Vector as V
24
25 import Gargantext.Prelude
26 import Gargantext.Text.Search
27 import Gargantext.Text.Parsers.CSV
28
29 ------------------------------------------------------------------------
30
-- | A query is a list of search terms ('S.Term').
type Query = [S.Term]
32
-- | Keep only the documents whose 'd_docId' appears in the given list of
-- identifiers.
--
-- The relative order of the retained documents is the order they had in
-- the input vector; duplicates in the identifier list have no effect.
filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) wanted)
  where
    -- Built once per call, outside the predicate, so the set is not
    -- reconstructed for every document being tested.
    wanted = S.fromList docIds
35
36
-- | Read the CSV corpus at a fixed path, keep only the documents returned
-- by the search engine for a hard-coded list of query words, and write the
-- result to a second fixed path.
--
-- Before and after filtering, the number of documents and the value of
-- 'docsSize' (printed under the label "Mean size of docs") are reported on
-- standard output.
main :: IO ()
main = do
  (header, rawDocs) <- readCsv inputPath

  putStrLn $ "Number of documents before:" <> show (V.length rawDocs)
  putStrLn $ "Mean size of docs:" <> show (docsSize rawDocs)

  let searchable   = toDocs rawDocs
      searchEngine = insertDocs searchable initialDocSearchEngine
      matchingIds  = S.query searchEngine (map pack queryWords)
      keptDocs     = fromDocs (filterDocs matchingIds (V.fromList searchable))

  putStrLn $ "Number of documents after:" <> show (V.length keptDocs)
  putStrLn $ "Mean size of docs:" <> show (docsSize keptDocs)

  -- The header read from the input file is written back unchanged.
  writeCsv outputPath (header, keptDocs)
  where
    inputPath  = "/tmp/Gargantext_Corpus.csv"
    outputPath = "/tmp/Gargantext_Corpus_bis.csv"
    -- Alternative query kept from an earlier run:
    --   ["water", "scarcity", "morocco", "shortage","flood"]
    queryWords = ["gratuit", "gratuité", "culture", "culturel"]