]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/CleanCsvCorpus.hs
[REFACT] Group fun and types
[gargantext.git] / bin / gargantext-cli / CleanCsvCorpus.hs
1 {-|
2 Module : CleanCsvCorpus.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
Given a Gargantext CSV file and its query, this script cleans and
compresses the contexts around the main terms of the query.
12 -}
13
14
15 module CleanCsvCorpus where
16
17 --import GHC.IO (FilePath)
18 import Data.SearchEngine as S
19 import qualified Data.Set as S
20 import Data.Text (pack)
21 import Data.Vector (Vector)
22 import qualified Data.Vector as V
23
24 import Gargantext.Prelude
25 import Gargantext.Core.Text.Search
26 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
27 ------------------------------------------------------------------------
28
-- | A search query expressed as a list of search-engine terms ('S.Term').
--
-- NOTE(review): this alias is not referenced in the visible part of this
-- file; 'main' builds its query as a plain list of string literals instead.
type Query = [S.Term]
30
-- | Keep only the documents whose 'CSV.d_docId' is a member of the given
-- list of document ids. The relative order of the kept documents is the
-- order they had in the input vector.
--
-- The id list is converted to a set exactly once, in a @where@ binding
-- shared by every call of the predicate, instead of inside the
-- per-document lambda where it could be rebuilt for each element.
filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter isWanted
  where
    -- Built lazily and at most once; never forced for an empty vector.
    wantedIds = S.fromList docIds

    isWanted doc = S.member (CSV.d_docId doc) wantedIds
33
34
-- | Script entry point.
--
-- Reads a corpus from a fixed input path with 'CSV.readFile', indexes the
-- converted documents in a search engine, runs a fixed list of query terms
-- against it, keeps only the documents whose ids the query returned, and
-- writes the result to a fixed output path with 'CSV.writeFile'.
--
-- Document counts and the value of 'CSV.docsSize' are printed before and
-- after filtering. The first component returned by 'CSV.readFile'
-- (presumably the CSV header -- TODO confirm) is passed back unchanged to
-- 'CSV.writeFile'.
main :: IO ()
main = do
  let inputPath  = "/tmp/Gargantext_Corpus.csv"
      outputPath = "/tmp/Gargantext_Corpus_bis.csv"
      -- Alternative query kept for reference:
      --   ["water", "scarcity", "morocco", "shortage","flood"]
      queryTerms = ["gratuit", "gratuité", "culture", "culturel"]

  (header, rawDocs) <- CSV.readFile inputPath

  -- Statistics on the corpus as read from disk.
  putStrLn ("Number of documents before:" <> show (V.length rawDocs))
  putStrLn ("Mean size of docs:" <> show (CSV.docsSize rawDocs))

  let parsedDocs   = CSV.toDocs rawDocs
      searchEngine = insertDocs parsedDocs initialDocSearchEngine
      matchingIds  = S.query searchEngine (map pack queryTerms)
      keptDocs     = CSV.fromDocs (filterDocs matchingIds (V.fromList parsedDocs))

  -- Same statistics on the documents retained by the query.
  putStrLn ("Number of documents after:" <> show (V.length keptDocs))
  putStrLn ("Mean size of docs:" <> show (CSV.docsSize keptDocs))

  CSV.writeFile outputPath (header, keptDocs)