]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Iramuteq.hs
[VERSION] +1 to 0.0.6.9.9.4.3
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Iramuteq.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Iramuteq
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Parser for Iramuteq corpus files. Each notice starts with a header line
11 beginning with @****@ that carries @*name_value@ fields, followed by its text.
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, parser, keys) where
16
17 import Control.Applicative
18 import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly)
19 import Data.Attoparsec.ByteString.Char8 (isEndOfLine, takeWhile, endOfLine)
20 import Data.ByteString (ByteString)
21 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
22 import qualified Data.ByteString as DB
23
-- | Read the file at the given path as a strict 'ByteString' and run
-- 'parser' over its contents.
--
-- Returns @Left message@ when parsing fails, otherwise one list of
-- @(name, value)@ pairs per notice.
parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile fp = parseOnly parser <$> DB.readFile fp
28
29 -------------------------------------------------------------
-- | Parse zero or more consecutive notices (see 'notice').
--
-- Because it is built with 'many', it stops at the first position where a
-- notice no longer parses and returns what it has collected so far.
parser :: Parser [[(ByteString, ByteString)]]
parser = many notice
34
-- | Parse one notice: a @**** ...@ header line (see 'headers') followed by
-- the notice body.
--
-- The result is the header fields with an extra @("text", body)@ pair
-- appended. The body is every following line, kept byte-for-byte including
-- its line terminators, up to (but excluding) the next line that starts with
-- @"**** "@, or up to the end of the input.
--
-- Asterisks inside the body are kept: only a line beginning with @"**** "@
-- ends it, so a stray @*@ in the text neither truncates this notice nor
-- prevents the following notices from being parsed.
notice :: Parser [(ByteString, ByteString)]
notice = do
  hs   <- headers
  body <- DB.concat <$> many bodyLine
  pure $ hs <> [("text", body)]
  where
    -- One body line together with its trailing newline, if there is one.
    bodyLine :: Parser ByteString
    bodyLine = do
      line <- takeWhile (/= '\n')
      eol  <- "\n" <|> pure ""
      let chunk = line <> eol
      -- Fail (and let 'many' stop) on the next header line, and on the empty
      -- chunk produced at end of input so that 'many' cannot loop forever.
      if DB.null chunk || "**** " `DB.isPrefixOf` line
        then empty
        else pure chunk
40
41 -----------------------------------------------------------------
-- | Parse a notice header line and return the @(name, value)@ pairs it
-- carries: the line is captured by 'header', then its bytes are parsed by
-- 'fields'.
headers :: Parser [(ByteString, ByteString)]
headers = header `parseOf` fields
44
-- | Match the @"**** "@ marker, then return the rest of that line.
--
-- The end-of-line sequence is consumed but is not part of the result.
header :: Parser ByteString
header = do
  _    <- "**** "
  line <- takeTill isEndOfLine
  line <$ endOfLine
47
48 -----------------------------------------------------------------
-- | Parse zero or more header fields.
--
-- Each field is first isolated by 'field'; its bytes are then split into a
-- @(name, value)@ pair by 'fieldTuple'.
fields :: Parser [(ByteString, ByteString)]
fields = many $ field `parseOf` fieldTuple
51
-- | Parse one raw field: a @*@ followed by the field's bytes.
--
-- The field normally runs up to the next space, which is consumed. When no
-- space follows (as for the last field of a line), it runs up to the next
-- newline or the end of the input instead.
field :: Parser ByteString
field = "*" *> (beforeSpace <|> restOfLine)
  where
    beforeSpace = takeWhile (/= ' ') <* " "
    restOfLine  = takeWhile (/= '\n')
55
-- | Split a raw field of the form @name_value@ at its first underscore.
--
-- The underscore itself is dropped; the value runs up to the next newline or
-- the end of the input. Fails when the input contains no underscore.
fieldTuple :: Parser (ByteString, ByteString)
fieldTuple = (,) <$> name <*> value
  where
    name  = takeWhile (/= '_') <* "_"
    value = takeWhile (/= '\n')
61
62 -----------------------------------------------------------------
-- | Run a parser over an already-captured 'ByteString' from inside another
-- parser.
--
-- Yields the inner parser's result on success. When the inner parse fails,
-- the outer parser fails with 'empty' and the inner error message is dropped.
constP :: Parser a -> ByteString -> Parser a
constP p = either (const empty) pure . parseOnly p
67
-- | @parseOf ptxt pa@ first runs @ptxt@ on the input, then runs @pa@ over
-- the bytes that @ptxt@ returned (via 'constP').
--
-- It fails when either step fails; input consumed by @ptxt@ is only kept
-- when both steps succeed.
parseOf :: Parser ByteString -> Parser a -> Parser a
parseOf ptxt pa = ptxt >>= constP pa
72
73 -----------------------------------------------------------------
-- | Rename the known Iramuteq field names; any other name is returned
-- unchanged.
--
-- NOTE: the field names listed here may differ between Iramuteq file formats.
keys :: ByteString -> ByteString
keys f = case f of
  "id"    -> "doi"
  "qui"   -> "authors"
  "quand" -> "PY"
  "type"  -> "source"
  "titre" -> "title"
  "ou"    -> "institutes"
  "text"  -> "abstract"
  _       -> f