parser.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. /*
  2. * MIT License
  3. *
  4. * Copyright (c) 2020 Alexey Edelev <semlanik@gmail.com>
  5. *
  6. * This file is part of gostfix project https://git.semlanik.org/semlanik/gostfix
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy of this
  9. * software and associated documentation files (the "Software"), to deal in the Software
  10. * without restriction, including without limitation the rights to use, copy, modify,
  11. * merge, publish, distribute, sublicense, and/or sell copies of the Software, and
  12. * to permit persons to whom the Software is furnished to do so, subject to the following
  13. * conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in all copies
  16. * or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  19. * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  20. * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
  21. * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  22. * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23. * DEALINGS IN THE SOFTWARE.
  24. */
  25. package scanner
  26. import (
  27. "bufio"
  28. "bytes"
  29. "encoding/base64"
  30. "encoding/hex"
  31. "fmt"
  32. "io/ioutil"
  33. "log"
  34. "mime/quotedprintable"
  35. "os"
  36. "strings"
  37. "net/mail"
  38. "git.semlanik.org/semlanik/gostfix/common"
  39. "git.semlanik.org/semlanik/gostfix/config"
  40. utils "git.semlanik.org/semlanik/gostfix/utils"
  41. "github.com/google/uuid"
  42. enmime "github.com/jhillyerd/enmime"
  43. )
  44. const (
  45. StateHeaderScan = iota
  46. StateBodyScan
  47. )
  48. const (
  49. AtLeastOneHeaderMask = 1 << iota
  50. FromHeaderMask
  51. DateHeaderMask
  52. ToHeaderMask
  53. AllHeaderMask = 15
  54. )
  55. type parseData struct {
  56. state int
  57. mandatoryHeaders int
  58. previousHeader *string
  59. email *common.Mail
  60. contentTransferEncoding string
  61. bodyContentType string
  62. bodyData string
  63. activeBoundary string
  64. }
  65. func (pd *parseData) reset() {
  66. *pd = parseData{
  67. state: StateHeaderScan,
  68. previousHeader: nil,
  69. mandatoryHeaders: 0,
  70. email: common.NewMail(),
  71. bodyContentType: "plain/text",
  72. bodyData: "",
  73. activeBoundary: "",
  74. }
  75. }
  76. func parseFile(file *utils.LockedFile) []*common.Mail {
  77. log.Println("Parse file")
  78. defer log.Println("Exit parse")
  79. var emails []*common.Mail
  80. pd := &parseData{}
  81. pd.reset()
  82. scanner := bufio.NewScanner(file)
  83. for scanner.Scan() {
  84. currentText := scanner.Text()
  85. if utils.RegExpUtilsInstance().MailIndicator.MatchString(currentText) {
  86. if pd.mandatoryHeaders == AllHeaderMask {
  87. pd.parseBody()
  88. emails = append(emails, pd.email)
  89. }
  90. pd.reset()
  91. fmt.Println("Found new email" + currentText)
  92. continue
  93. }
  94. switch pd.state {
  95. case StateHeaderScan:
  96. if currentText == "" {
  97. if pd.mandatoryHeaders&AtLeastOneHeaderMask == AtLeastOneHeaderMask { //Cause we read at least one header
  98. pd.previousHeader = nil
  99. boundaryCapture := utils.RegExpUtilsInstance().BoundaryFinder.FindStringSubmatch(pd.bodyContentType)
  100. if len(boundaryCapture) == 2 {
  101. pd.activeBoundary = boundaryCapture[1]
  102. } else {
  103. pd.activeBoundary = ""
  104. }
  105. pd.state = StateBodyScan
  106. //Header postprocessing
  107. address, err := mail.ParseAddress(pd.email.Header.From)
  108. if err == nil {
  109. pd.email.Header.From = address.Name + "<" + address.Address + ">"
  110. } else {
  111. fmt.Printf("Unable to parse from email: %s", err)
  112. }
  113. }
  114. } else {
  115. pd.parseHeader(currentText)
  116. }
  117. case StateBodyScan:
  118. pd.bodyData += currentText + "\n"
  119. capture := utils.RegExpUtilsInstance().BoundaryEndFinder.FindStringSubmatch(currentText)
  120. if len(capture) == 2 && pd.activeBoundary == capture[1] {
  121. pd.state = StateBodyScan
  122. pd.activeBoundary = ""
  123. }
  124. }
  125. }
  126. if pd.state == StateBodyScan {
  127. if pd.mandatoryHeaders == AllHeaderMask {
  128. pd.parseBody()
  129. emails = append(emails, pd.email)
  130. }
  131. pd.reset()
  132. }
  133. return emails
  134. }
  135. func (pd *parseData) parseHeader(headerRaw string) {
  136. capture := utils.RegExpUtilsInstance().HeaderFinder.FindStringSubmatch(headerRaw)
  137. encoded := false
  138. //Parse header
  139. if len(capture) == 3 {
  140. // fmt.Printf("capture Header %s : %s\n", strings.ToLower(capture[0]), strings.ToLower(capture[1]))
  141. header := strings.ToLower(capture[1])
  142. pd.mandatoryHeaders |= AtLeastOneHeaderMask
  143. switch header {
  144. case "from":
  145. pd.previousHeader = &pd.email.Header.From
  146. pd.mandatoryHeaders |= FromHeaderMask
  147. case "to":
  148. pd.previousHeader = &pd.email.Header.To
  149. pd.mandatoryHeaders |= ToHeaderMask
  150. case "x-original-to":
  151. if pd.email.Header.To == "" {
  152. pd.previousHeader = &pd.email.Header.To
  153. pd.mandatoryHeaders |= ToHeaderMask
  154. }
  155. case "cc":
  156. pd.previousHeader = &pd.email.Header.Cc
  157. case "bcc":
  158. pd.previousHeader = &pd.email.Header.Bcc
  159. pd.mandatoryHeaders |= ToHeaderMask
  160. case "subject":
  161. encoded = true
  162. pd.previousHeader = &pd.email.Header.Subject
  163. case "date":
  164. pd.previousHeader = nil
  165. unixTime, err := mail.ParseDate(strings.Trim(capture[2], " \t"))
  166. if err == nil {
  167. pd.email.Header.Date = unixTime.Unix()
  168. pd.mandatoryHeaders |= DateHeaderMask
  169. } else {
  170. log.Printf("Unable to parse message: %s\n", err)
  171. }
  172. case "content-transfer-encoding":
  173. pd.previousHeader = &pd.contentTransferEncoding
  174. case "content-type":
  175. pd.previousHeader = &pd.bodyContentType
  176. default:
  177. pd.previousHeader = nil
  178. }
  179. if pd.previousHeader != nil {
  180. *pd.previousHeader = strings.Trim(capture[2], " \t")
  181. if encoded {
  182. *pd.previousHeader = decodeEncoded(*pd.previousHeader)
  183. }
  184. }
  185. return
  186. }
  187. //Parse folding
  188. capture = utils.RegExpUtilsInstance().FoldingFinder.FindStringSubmatch(headerRaw)
  189. if len(capture) == 2 && pd.previousHeader != nil {
  190. *pd.previousHeader += decodeEncoded(strings.Trim(capture[1], " \t"))
  191. }
  192. }
  193. func (pd *parseData) parseBody() {
  194. buffer := bytes.NewBufferString("content-transfer-encoding: " + pd.contentTransferEncoding + "\ncontent-type: " + pd.bodyContentType + "\n\n" + pd.bodyData)
  195. en, err := enmime.ReadEnvelope(buffer)
  196. if err != nil {
  197. log.Printf("Unable to read mail body %s\n\nBody content: %s\n\n", err, pd.bodyData)
  198. return
  199. }
  200. pd.email.Body = &common.MailBody{}
  201. pd.email.Body.PlainText = en.Text
  202. pd.email.Body.RichText = en.HTML
  203. for _, attachment := range en.Attachments {
  204. uuid := uuid.New()
  205. fileName := hex.EncodeToString(uuid[:])
  206. attachmentFile, err := os.Create(config.ConfigInstance().AttachmentsPath + "/" + fileName)
  207. log.Printf("Attachment found %s\n", fileName)
  208. if err != nil {
  209. log.Printf("Unable to save attachment %s %s\n", fileName, err)
  210. continue
  211. }
  212. pd.email.Body.Attachments = append(pd.email.Body.Attachments, &common.AttachmentHeader{
  213. Id: fileName,
  214. FileName: attachment.FileName,
  215. ContentType: attachment.ContentType,
  216. })
  217. attachmentFile.Write(attachment.Content)
  218. }
  219. }
  220. func decodeEncoded(dataEncoded string) string {
  221. dataParts := utils.RegExpUtilsInstance().EncodedStringFinder.FindAllString(dataEncoded, -1)
  222. if len(dataParts) <= 0 {
  223. return dataEncoded
  224. }
  225. var decodedBuffer []byte
  226. for _, headerPart := range dataParts {
  227. headerPart = headerPart[2 : len(headerPart)-2]
  228. headerPartParts := strings.Split(headerPart, "?")
  229. if len(headerPartParts) == 3 {
  230. switch strings.ToLower(headerPartParts[1]) {
  231. case "b":
  232. fmt.Printf("Decode base64: %s\n", headerPartParts[2])
  233. decodedBase64, err := base64.StdEncoding.DecodeString(headerPartParts[2])
  234. if err == nil {
  235. decodedBuffer = append(decodedBuffer, decodedBase64...)
  236. }
  237. case "q":
  238. decodedQuotedPrintable, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(headerPartParts[2])))
  239. if err == nil {
  240. decodedBuffer = append(decodedBuffer, decodedQuotedPrintable...)
  241. }
  242. default:
  243. }
  244. }
  245. }
  246. if len(decodedBuffer) > 0 {
  247. //TODO: check encoding here
  248. return string(decodedBuffer)
  249. }
  250. return dataEncoded
  251. }