]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/CleanCsvCorpus.hs
Changelog
[gargantext.git] / bin / gargantext-cli / CleanCsvCorpus.hs
1 {-|
2 Module : CleanCsvCorpus.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Given a Gargantext CSV File and its Query This script cleans and
11 compress the contexts around the main terms of the query.
12 -}
13
14
15 module CleanCsvCorpus where
16
17 --import GHC.IO (FilePath)
18 import Data.Either (Either(..))
19 import Data.SearchEngine as S
20 import qualified Data.Set as S
21 import Data.Text (pack)
22 import Data.Vector (Vector)
23 import qualified Data.Vector as V
24
25 import Gargantext.Prelude
26 import Gargantext.Core.Text.Search
27 import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
28 ------------------------------------------------------------------------
29
30 type Query = [S.Term]
31
32 filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
33 filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
34
35
36 main :: IO ()
37 main = do
38 let rPath = "/tmp/Gargantext_Corpus.csv"
39 let wPath = "/tmp/Gargantext_Corpus_bis.csv"
40 --let q = ["water", "scarcity", "morocco", "shortage","flood"]
41 let q = ["gratuit", "gratuité", "culture", "culturel"]
42
43 eDocs <- CSV.readFile rPath
44 case eDocs of
45 Right (h, csvDocs) -> do
46 putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
47 putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
48
49 let docs = CSV.toDocs csvDocs
50 let engine = insertDocs docs initialDocSearchEngine
51 let docIds = S.query engine (map pack q)
52 let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
53
54 putStrLn $ "Number of documents after:" <> show (V.length docs')
55 putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
56
57 CSV.writeFile wPath (h, docs')
58 Left e -> panic $ "Error: " <> (pack e)