]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/Main.hs
[ngrams] fixes for list JSON import
[gargantext.git] / bin / gargantext-cli / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main specifications to index a corpus with a term list
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
22 import Control.Monad (zipWithM)
23 import Control.Monad.IO.Class
24 import Data.Aeson
25 import Data.ByteString.Lazy (writeFile)
26 import Data.Either (Either(..))
27 import Data.List (cycle, concat, unwords)
28 import Data.List.Split (chunksOf)
29 import Data.Map (Map)
30 import qualified Data.Map as DM
31 import Data.Text (pack, Text)
32 import qualified Data.Text as DT
33 import Data.Tuple.Extra (both)
34 import qualified Data.Vector as DV
35 import GHC.Generics
36 import System.IO (hPutStr, hFlush, stderr)
37 import System.Environment
38
39 import Gargantext.Prelude
40 import Gargantext.Core
41 import Gargantext.Core.Types
42 import Gargantext.Core.Text.Terms
43 import Gargantext.Core.Text.Context
44 import Gargantext.Core.Text.Terms.WithList
45 import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
46 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
47 import Gargantext.Core.Text.Terms (terms)
48 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
49
50 ------------------------------------------------------------------------
51 -- OUTPUT format
52
-- | Co-occurrence result computed for the documents of one year.
data CoocByYear = CoocByYear
  { year         :: Int
    -- ^ Year the result belongs to.
  , nbContexts   :: NbContexts
    -- ^ Number of contexts (texts) that were processed for that year.
  , coocurrences :: Map (Text, Text) Coocs
    -- ^ 'Coocs' value recorded for each pair of terms.
  }
  deriving (Show, Generic)
57
-- | Wrapper around the list of per-year results; this is the value that
-- gets serialised to JSON.
data CoocByYears = CoocByYears
  { years :: [CoocByYear]
  }
  deriving (Show, Generic)
60
-- | Number of contexts counted for one year.
type NbContexts = Int

-- JSON encoders: both instances have an empty body and therefore rely on the
-- class defaults (available through the derived 'Generic' instances).
instance ToJSON CoocByYear
instance ToJSON CoocByYears
65 ------------------------------------------------------------------------
66
-- | Extract the listed terms from every text of one year and compute their
-- co-occurrences.
--
-- Takes the compiled term 'Patterns' and a @(year, texts)@ pair. The result
-- records the year, the number of texts, and the co-occurrence map whose
-- multi-word keys have been joined with 'DT.unwords'.
--
-- A progress line is printed to stdout before and after the computation.
filterTermsAndCooc
  :: Patterns
  -> (Int, [Text])
  -> IO CoocByYear
filterTermsAndCooc patterns (year, ts) = do
  announce "start"
  coocs <- coocOnContexts identity <$> mapM (terms' patterns) ts
  announce "stop"
  pure $ CoocByYear
    { year         = year
    , nbContexts   = length ts
    , coocurrences = DM.mapKeys (both DT.unwords) coocs
    }
  where
    -- Print one progress line tagged with the year being processed and the
    -- capability number the current thread is running on.
    announce step = do
      self <- myThreadId
      (capability, _) <- threadCapability self
      putStrLn
        (unwords ["filterTermsAndCooc:", step, show year, "on proc", show capability])
83
-- | Command-line entry point.
--
-- Expects exactly three arguments: the corpus CSV file, the term-list CSV
-- file and the output file. The corpus is grouped by publication year, the
-- listed terms are searched in each year's texts concurrently (one task per
-- year), and the per-year results are written to the output file as JSON.
--
-- Any other number of arguments aborts with a usage message instead of an
-- opaque pattern-match failure.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [corpusFile, termListFile, outputFile] -> do
      --corpus :: IO (DM.IntMap [[Text]])
      eCorpusFile <- readCSVFile corpusFile
      case eCorpusFile of
        Right cf -> do
          -- One entry per publication year; each document contributes a
          -- single text made of its title and abstract separated by a space.
          -- NOTE(review): 'fromMIntOrDec defaultYear' presumably falls back
          -- to 'defaultYear' when the year is missing -- confirm.
          let corpus = DM.fromListWith (<>)
                     . DV.toList
                     . DV.map (\n -> ( fromMIntOrDec defaultYear $ csv_publication_year n
                                     , [(csv_title n) <> " " <> (csv_abstract n)]
                                     ))
                     . snd $ cf

          -- termListMap :: [Text]
          termList <- csvMapTermList termListFile

          -- Report how many entries the term list contains.
          putStrLn $ show $ length termList

          let patterns = buildPatterns termList

          -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
          r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
          writeFile outputFile $ encode (CoocByYears r)
        Left e -> panic $ "Error: " <> (pack e)

    _ -> do
      progName <- getProgName
      panic $ pack $
        "Usage: " <> progName <> " <corpus.csv> <term-list.csv> <output.json>"
108
109
110
111 ------------------------------------------------------------------------
-- Tools

-- | Sequential monadic map that shows progress on stderr.
--
-- Before each element is processed, a carriage return followed by the next
-- spinner character (cycling through @-@, @\\@, @|@, @/@) is written to
-- stderr and flushed. Once every element has been processed, the spinner is
-- replaced by @Done@ and the collected results are returned in input order.
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
  results <- zipWithM step (cycle "-\\|/") xs
  liftIO (hPutStr stderr "\rDone\n")
  pure results
  where
    -- Draw one spinner frame, then run the user action on the element.
    step frame x = do
      liftIO (hPutStr stderr ['\r', frame])
      liftIO (hFlush stderr)
      f x
123
-- | Optimisation that needs further development (not used yet).
--
-- Runs @f@ over @ts@ by splitting the list into contiguous chunks and
-- processing the chunks concurrently; inside a chunk the elements are
-- processed sequentially with 'mapM'. Results are returned in input order.
--
-- The chunk size is the number of elements divided by the number of
-- capabilities, rounded /up/ (and at least 1), so that at most one chunk per
-- capability is created. Rounding down would spill the remainder into extra
-- chunks (e.g. 7 elements on 4 capabilities would give 7 chunks).
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
  caps <- getNumCapabilities
  let total = length ts
      -- Ceiling division; 'max' keeps 'chunksOf' away from a zero chunk size
      -- when the input list is empty.
      n = 1 `max` ((total + caps - 1) `div` caps)
  concat <$> mapConcurrently (mapM f) (chunksOf n ts)
130
131
-- | Run 'extractTermsWithList' with the given patterns on one text, flatten
-- its result by one level with 'concat', and return it in the caller's
-- context via 'pure'.
--
-- NOTE(review): no type signature is given here; the previously commented
-- one (@Patterns -> Text -> Corpus [[Text]]@) does not mention the
-- surrounding context introduced by 'pure' -- confirm the intended type.
terms' pats txt = pure (concat (extractTermsWithList pats txt))
134
135
136 -- | TODO Minimal Example
137 --testCooc = do
138 -- let patterns = buildPatterns testTermList
139 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
140 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
141
142
-- | Tiny hand-written corpus for experiments: a list of @(year, texts)@.
testCorpus :: [(Int, [Text])]
testCorpus =
  [ (1998, pack <$> ["The beees"])
  , (1999, pack <$> [ "The bees and the flowers"
                    -- , "The bees and the flowers"
                    ])
  ]
149
-- | Tiny hand-written term list matching 'testCorpus': each entry pairs a
-- one-word label with a single one-word form.
testTermList :: TermList
testTermList =
  [ entry "bee" "bees"
  , entry "flower" "flowers"
  ]
  where
    -- Build one entry from a label word and one form word.
    entry label form = ([pack label], [[pack form]])
154