]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/Main.hs
Merge remote-tracking branch 'origin/dev-auth' into dev-merge
[gargantext.git] / bin / gargantext-cli / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main specifications to index a corpus with a term list
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import Data.ByteString.Lazy (writeFile)
26
27 import Data.Maybe (catMaybes)
28 import Data.Text (pack)
29 import qualified Data.Text as DT
30
31 import Data.Tuple.Extra (both)
32 import qualified Data.Vector as DV
33 import qualified Data.Maybe as DMaybe
34
35 import Control.Monad (zipWithM)
36 import Control.Monad.IO.Class
37
38 import Data.Map (Map)
39 import qualified Data.IntMap as DIM
40 import qualified Data.Map as DM
41
42 import GHC.Generics
43 import Data.Aeson
44
45 import Data.Text (Text)
46 import Data.List (cycle, concat, unwords)
47 import Data.List.Split (chunksOf)
48 import System.IO (hPutStr, hFlush, stderr)
49 import System.Environment
50 import Control.Concurrent.Async as CCA (mapConcurrently)
51 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
52 import Prelude ((>>))
53
54 import Gargantext.Prelude
55 import Gargantext.Core
56 import Gargantext.Core.Types
57 import Gargantext.Text.Terms
58 import Gargantext.Text.Context
59 import Gargantext.Text.Terms.WithList
60 import Gargantext.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
61 import Gargantext.Text.List.CSV (csvGraphTermList)
62 import Gargantext.Text.Terms (terms)
63 import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
64
65 ------------------------------------------------------------------------
66 -- OUTPUT format
67
68 data CoocByYear = CoocByYear { year :: Int
69 , nbContexts :: NbContexts
70 , coocurrences :: Map (Text, Text) Coocs
71 } deriving (Show, Generic)
72
73 data CoocByYears = CoocByYears { years :: [CoocByYear] }
74 deriving (Show, Generic)
75
76 type NbContexts = Int
77
78 instance ToJSON CoocByYear
79 instance ToJSON CoocByYears
80 ------------------------------------------------------------------------
81
82 filterTermsAndCooc
83 :: Patterns
84 -> (Int, [Text])
85 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
86 filterTermsAndCooc patterns (year, ts) = do
87 log "start"
88 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
89 log "stop"
90 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
91 where
92
93 log m = do
94 tid <- myThreadId
95 (p, _) <- threadCapability tid
96 putStrLn . unwords $
97 ["filterTermsAndCooc:", m, show year, "on proc", show p]
98
99 main :: IO ()
100 main = do
101 [corpusFile, termListFile, outputFile] <- getArgs
102
103 --corpus :: IO (DM.IntMap [[Text]])
104 corpus <- DM.fromListWith (<>)
105 . DV.toList
106 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
107 . snd
108 <$> readFile corpusFile
109
110 -- termListMap :: [Text]
111 termList <- csvGraphTermList termListFile
112
113 putStrLn $ show $ length termList
114
115 let patterns = buildPatterns termList
116
117 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
118 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
119 writeFile outputFile $ encode (CoocByYears r)
120
121
122
123 ------------------------------------------------------------------------
124 -- | Tools
125 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
126 mapMP f xs = do
127 bs <- zipWithM g (cycle "-\\|/") xs
128 liftIO $ hPutStr stderr "\rDone\n"
129 pure bs
130 where
131 g c x = do
132 liftIO $ hPutStr stderr ['\r',c]
133 liftIO $ hFlush stderr
134 f x
135
136 -- | Optimi that need further developments (not used yet)
137 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
138 mapConcurrentlyChunked f ts = do
139 caps <- getNumCapabilities
140 let n = 1 `max` (length ts `div` caps)
141 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
142
143
144 --terms' :: Patterns -> Text -> Corpus [[Text]]
145 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
146
147
148 -- | TODO Minimal Example
149 --testCooc = do
150 -- let patterns = buildPatterns testTermList
151 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
152 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
153
154
155 testCorpus :: [(Int, [Text])]
156 testCorpus = [ (1998, [pack "The beees"])
157 , (1999, [ pack "The bees and the flowers"
158 --, pack "The bees and the flowers"
159 ])
160 ]
161
162 testTermList :: TermList
163 testTermList = [ ([pack "bee"], [[pack "bees"]])
164 , ([pack "flower"], [[pack "flowers"]])
165 ]
166