2 Module : Gargantext.Core.Text.Corpus.Parsers.Iramuteq
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Parser for Iramuteq corpus files: a record opens with a @**** @ header line
11 holding @*name_value@ fields, and the text that follows is kept under the @text@ key.
15 module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, parser, keys) where
17 import Control.Applicative
18 import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly)
19 import Data.Attoparsec.ByteString.Char8 (isEndOfLine, takeWhile, endOfLine)
20 import Data.ByteString (ByteString)
21 import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
22 import qualified Data.ByteString as DB
-- | Read the file at @fp@ as a strict 'ByteString' and run 'parser' over its
-- whole content.
--
-- The result is @Left msg@ (the attoparsec error message) when parsing
-- fails, and @Right records@ otherwise, where every record is a list of
-- key/value pairs.
parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile fp = parseOnly parser <$> DB.readFile fp
29 -------------------------------------------------------------
-- | Top-level parser: 'parseIramuteqFile' runs it over the whole file content.
-- It yields a list of records, each record being a list of key/value pairs.
-- NOTE(review): only the type signature is present at this point; the defining
-- equation is absent and must be confirmed against the original source.
30 parser :: Parser [[(ByteString, ByteString)]]
-- | Parse one record into its list of key/value pairs.
-- The statements present below read the record's text and append it to @hs@
-- under the @"text"@ key.
-- NOTE(review): the equation head and the binding of @hs@ are absent here;
-- @hs@ is presumably the result of 'headers' — confirm.
35 notice :: Parser [(ByteString, ByteString)]
-- The text runs up to the next '*' character or the end of the input.
38 ns <- takeWhile (/= '*')
39 pure $ hs <> [("text", ns)]
41 -----------------------------------------------------------------
-- | Parse one @**** @ header line into its key/value fields.
--
-- 'header' captures the content of the line; 'parseOf' then hands that
-- captured text to 'fields'.
headers :: Parser [(ByteString, ByteString)]
headers = header `parseOf` fields
-- | Match the literal @"**** "@ marker, return everything up to the end of
-- the line, and consume the line terminator.
--
-- The parser fails when the marker is missing or when no line terminator
-- follows the captured text.
header :: Parser ByteString
header = do
  _ <- "**** "
  content <- takeTill isEndOfLine
  endOfLine
  pure content
48 -----------------------------------------------------------------
-- | Parse zero or more consecutive fields into key/value pairs.
--
-- Because 'many' is used, the result is the empty list when no field can be
-- parsed.
fields :: Parser [(ByteString, ByteString)]
fields = many keyValue
  where
    -- One field: 'field' captures the text after the leading @*@ and
    -- 'parseOf' hands that text to 'fieldTuple'.
    keyValue = field `parseOf` fieldTuple
-- | Capture the raw text of one field, without its leading @*@.
--
-- Two layouts are tried in order:
--
--   1. the field is followed by a space: the text before that space is
--      returned and the space is consumed;
--   2. otherwise the text up to the next newline (or the end of the input)
--      is returned and nothing after it is consumed.
field :: Parser ByteString
field = spaceTerminated <|> lineTerminated
  where
    spaceTerminated = "*" *> takeWhile (/= ' ') <* " "
    lineTerminated  = "*" *> takeWhile (/= '\n')
-- | Split the text of one field at its first underscore.
-- NOTE(review): the equation head and the line building the result are absent
-- here; the result is presumably the pair @(name, rest)@ — confirm.
56 fieldTuple :: Parser (ByteString, ByteString)
-- name: everything before the first '_'; the '_' itself is consumed and dropped.
58 name <- takeWhile (/= '_') <* "_"
-- rest: what remains, up to the next newline or the end of the input.
59 rest <- takeWhile (/= '\n')
62 -----------------------------------------------------------------
-- | Run parser @p@ over the already-captured bytes @t@ with 'parseOnly' and
-- turn the outcome back into a 'Parser' result.
-- NOTE(review): the @case@ alternatives are absent here, so how the
-- 'Left' / 'Right' outcomes are mapped cannot be stated — confirm.
63 constP :: Parser a -> ByteString -> Parser a
64 constP p t = case parseOnly p t of
-- | Run @ptxt@ to capture a chunk of text, then parse that chunk with @pa@
-- by passing it to 'constP'.
68 parseOf :: Parser ByteString -> Parser a -> Parser a
-- NOTE(review): 'empty' is the identity of '<|>', so @<|> empty@ does not
-- change which inputs are accepted; it looks removable — confirm.
69 parseOf ptxt pa = bothParse <|> empty
-- The captured text becomes the whole input of @pa@.
71 bothParse = ptxt >>= constP pa
73 -----------------------------------------------------------------
74 -- These keys may not be constant across Iramuteq file formats.
-- | Rename an Iramuteq field name @f@ to another key name.
-- Mappings present below: qui -> authors, type -> source, titre -> title,
-- ou -> institutes, text -> abstract.
-- NOTE(review): the equation head binding @f@ is absent here, and further
-- guards (including any fallback case) may be missing — confirm.
75 keys :: ByteString -> ByteString
78 | f == "qui" = "authors"
80 | f == "type" = "source"
81 | f == "titre" = "title"
82 | f == "ou" = "institutes"
83 | f == "text" = "abstract"