]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
[framewrite] better line parsing
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / FrameWrite.hs
1 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite where
2
3 import Control.Applicative ((*>))
4 import Control.Monad (void)
5 import Data.Maybe
6 import Data.Text
7 import Gargantext.Prelude
8 import Prelude (String, (++))
9 import Text.Parsec hiding (Line)
10 import Text.Parsec.Combinator
11 import Text.Parsec.String
12
13
14 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
15
16 -- title : everything above the first ==
17 -- Authors : default : anonymous ; except if the following line is encountered ^@@authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
18 -- date : default : date of last change except if the following line is encountered ^@@date: 2021-09-10
19 -- source: Name of the root node except if the following line is encountered ^@@source:
20 -- By default, 1 framawrite node = 1 document. Option for further developments: allow to give a level at generation for the split within framawrite node : :
21 --
22 -- par défaut: un doc == 1 NodeWrite
23 -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
24
25 sample =
26 unlines
27 [ "title1"
28 , "title2"
29 , "=="
30 , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
31 , "^@@date: 2021-09-10"
32 , "^@@source: someSource"
33 , "document contents 1"
34 , "document contents 2"
35 ]
36
37 sampleUnordered =
38 unlines
39 [ "title1"
40 , "title2"
41 , "=="
42 , "document contents 1"
43 , "^@@date: 2021-09-10"
44 , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
45 , "^@@source: someSource"
46 , "document contents 2"
47 ]
48
49 parseSample = parse documentP "sample" (unpack sample)
50 parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
51 parseLinesSample = parse documentLinesP "sample" (unpack sample)
52 parseLinesSampleUnordered = parse documentLinesP "sampleUnordered" (unpack sampleUnordered)
53
54 data Author =
55 Author { firstName :: Text
56 , lastName :: Text }
57 deriving (Show)
58
59 data Parsed =
60 Parsed { title :: Text
61 , authors :: [Author]
62 , date :: Maybe Text
63 , source :: Maybe Text
64 , contents :: Text }
65 deriving (Show)
66
67 emptyParsed =
68 Parsed { title = ""
69 , authors = []
70 , date = Nothing
71 , source = Nothing
72 , contents = "" }
73
74 data Line =
75 LAuthors [Author]
76 | LDate Text
77 | LSource Text
78 | LContents Text
79 | LTitle Text
80 deriving (Show)
81
82 parseLines :: Text -> Parsed
83 parseLines text = foldl f emptyParsed lst
84 where
85 lst = parse documentLinesP "" (unpack text)
86 f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
87 f (Parsed { .. }) (LDate d ) = Parsed { date = d, .. }
88 f (Parsed { .. }) (LSource s ) = Parsed { source = s, .. }
89 f (Parsed { .. }) (LContents c) = Parsed { contents = contents ++ c, .. }
90 f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. }
91
92 documentLinesP = do
93 t <- titleP
94 lines <- lineP `sepBy` newline
95 pure $ [LTitle $ pack t] ++ lines
96
97 lineP :: Parser Line
98 lineP = do
99 choice [ try authorsLineP
100 , try dateLineP
101 , try sourceLineP
102 , contentsLineP ]
103
104 authorsLineP = do
105 authors <- authorsP
106 pure $ LAuthors authors
107
108 dateLineP = do
109 date <- dateP
110 pure $ LDate $ pack date
111
112 sourceLineP = do
113 source <- sourceP
114 pure $ LSource $ pack source
115
116 contentsLineP = do
117 contents <- many (noneOf "\n")
118 pure $ LContents $ pack contents
119
120 --------------------
121
122 documentP = do
123 t <- titleP
124 a <- optionMaybe authorsP
125 d <- optionMaybe dateP
126 s <- optionMaybe sourceP
127 c <- contentsP
128 pure $ Parsed { title = pack t
129 , authors = fromMaybe [] a
130 , date = pack <$> d
131 , source = pack <$> s
132 , contents = pack c }
133
134 titleDelimiterP = do
135 newline
136 string "=="
137 tokenEnd
138 titleP :: Parser [Char]
139 titleP = manyTill anyChar (try titleDelimiterP)
140
141 authorsPrefixP = do
142 _ <- string "^@@authors:"
143 many (char ' ')
144 authorsP :: Parser [Author]
145 authorsP = try authorsPrefixP *> sepBy authorP (char ';')
146 authorP :: Parser Author
147 authorP = do
148 fn <- manyTill anyChar (char ',')
149 _ <- many (char ' ')
150 --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
151 --ln <- manyTill anyChar (tokenEnd)
152 ln <- many (noneOf "\n")
153 pure $ Author { firstName = pack fn, lastName = pack ln }
154 -- manyTill anyChar (void (char '\n') <|> eof)
155
156 datePrefixP = do
157 _ <- string "^@@date:"
158 many (char ' ')
159 dateP :: Parser [Char]
160 dateP = try datePrefixP
161 *> many (noneOf "\n")
162
163 sourcePrefixP = do
164 _ <- string "^@@source:"
165 many (char ' ')
166 sourceP :: Parser [Char]
167 sourceP = try sourcePrefixP
168 *> many (noneOf "\n")
169
170 contentsP :: Parser String
171 contentsP = many anyChar
172
173 tokenEnd = void (char '\n') <|> eof