parser.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. /*
  2. * MIT License
  3. *
  4. * Copyright (c) 2020 Alexey Edelev <semlanik@gmail.com>
  5. *
  6. * This file is part of gostfix project https://git.semlanik.org/semlanik/gostfix
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy of this
  9. * software and associated documentation files (the "Software"), to deal in the Software
  10. * without restriction, including without limitation the rights to use, copy, modify,
  11. * merge, publish, distribute, sublicense, and/or sell copies of the Software, and
  12. * to permit persons to whom the Software is furnished to do so, subject to the following
  13. * conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in all copies
  16. * or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  19. * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  20. * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
  21. * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  22. * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23. * DEALINGS IN THE SOFTWARE.
  24. */
  25. package scanner
  26. import (
  27. "bufio"
  28. "bytes"
  29. "encoding/hex"
  30. "log"
  31. "os"
  32. "strings"
  33. "time"
  34. "git.semlanik.org/semlanik/gostfix/common"
  35. utils "git.semlanik.org/semlanik/gostfix/utils"
  36. "github.com/google/uuid"
  37. enmime "github.com/jhillyerd/enmime"
  38. )
  39. const (
  40. StateHeaderScan = iota
  41. StateBodyScan
  42. )
  43. const (
  44. AtLeastOneHeaderMask = 1 << iota
  45. FromHeaderMask
  46. DateHeaderMask
  47. ToHeaderMask
  48. AllHeaderMask = 15
  49. )
  50. type parseData struct {
  51. state int
  52. mandatoryHeaders int
  53. previousHeader *string
  54. email *common.Mail
  55. bodyContentType string
  56. bodyData string
  57. activeBoundary string
  58. }
  59. func (pd *parseData) reset() {
  60. *pd = parseData{
  61. state: StateHeaderScan,
  62. previousHeader: nil,
  63. mandatoryHeaders: 0,
  64. email: NewEmail(),
  65. bodyContentType: "plain/text",
  66. bodyData: "",
  67. activeBoundary: "",
  68. }
  69. }
  70. func parseFile(file *utils.LockedFile) []*common.Mail {
  71. var emails []*common.Mail
  72. pd := &parseData{}
  73. pd.reset()
  74. scanner := bufio.NewScanner(file)
  75. for scanner.Scan() {
  76. switch pd.state {
  77. case StateHeaderScan:
  78. if scanner.Text() == "" {
  79. if pd.mandatoryHeaders&AtLeastOneHeaderMask == AtLeastOneHeaderMask { //Cause we read at least one header
  80. pd.previousHeader = nil
  81. boundaryCapture := utils.RegExpUtilsInstance().BoundaryFinder.FindStringSubmatch(pd.bodyContentType)
  82. if len(boundaryCapture) == 2 {
  83. pd.activeBoundary = boundaryCapture[1]
  84. } else {
  85. pd.activeBoundary = ""
  86. }
  87. pd.state = StateBodyScan
  88. }
  89. } else {
  90. pd.parseHeader(scanner.Text())
  91. }
  92. case StateBodyScan:
  93. if scanner.Text() == "" {
  94. if pd.state == StateBodyScan && pd.activeBoundary == "" {
  95. if pd.mandatoryHeaders == AllHeaderMask {
  96. pd.parseBody()
  97. emails = append(emails, pd.email)
  98. }
  99. pd.reset()
  100. continue
  101. }
  102. }
  103. if pd.activeBoundary != "" {
  104. pd.bodyData += scanner.Text() + "\n"
  105. capture := utils.RegExpUtilsInstance().BoundaryEndFinder.FindStringSubmatch(scanner.Text())
  106. if len(capture) == 2 && pd.activeBoundary == capture[1] {
  107. pd.state = StateBodyScan
  108. pd.activeBoundary = ""
  109. }
  110. }
  111. }
  112. }
  113. if pd.state == StateBodyScan {
  114. if pd.mandatoryHeaders == AllHeaderMask {
  115. pd.parseBody()
  116. emails = append(emails, pd.email)
  117. }
  118. pd.reset()
  119. }
  120. return emails
  121. }
  122. func (pd *parseData) parseHeader(headerRaw string) {
  123. capture := utils.RegExpUtilsInstance().HeaderFinder.FindStringSubmatch(headerRaw)
  124. //Parse header
  125. if len(capture) == 3 {
  126. // fmt.Printf("capture Header %s : %s\n", strings.ToLower(capture[0]), strings.ToLower(capture[1]))
  127. header := strings.ToLower(capture[1])
  128. pd.mandatoryHeaders |= AtLeastOneHeaderMask
  129. switch header {
  130. case "from":
  131. pd.previousHeader = &pd.email.Header.From
  132. pd.mandatoryHeaders |= FromHeaderMask
  133. case "to":
  134. pd.previousHeader = &pd.email.Header.To
  135. pd.mandatoryHeaders |= ToHeaderMask
  136. case "cc":
  137. pd.previousHeader = &pd.email.Header.Cc
  138. case "bcc":
  139. pd.previousHeader = &pd.email.Header.Bcc
  140. pd.mandatoryHeaders |= ToHeaderMask
  141. case "subject":
  142. pd.previousHeader = &pd.email.Header.Subject
  143. case "date":
  144. pd.previousHeader = nil
  145. time, err := time.Parse(time.RFC1123Z, strings.Trim(capture[2], " \t"))
  146. if err == nil {
  147. pd.email.Header.Date = time.Unix()
  148. pd.mandatoryHeaders |= DateHeaderMask
  149. }
  150. log.Printf("Invalid date format %s, %s", strings.Trim(capture[2], " \t"), err)
  151. case "content-type":
  152. pd.previousHeader = &pd.bodyContentType
  153. default:
  154. pd.previousHeader = nil
  155. }
  156. if pd.previousHeader != nil {
  157. *pd.previousHeader = strings.Trim(capture[2], " \t")
  158. }
  159. return
  160. }
  161. //Parse folding
  162. capture = utils.RegExpUtilsInstance().FoldingFinder.FindStringSubmatch(headerRaw)
  163. if len(capture) == 2 && pd.previousHeader != nil {
  164. *pd.previousHeader += capture[1]
  165. }
  166. }
  167. func (pd *parseData) parseBody() {
  168. buffer := bytes.NewBufferString("content-type:" + pd.bodyContentType + "\n\n" + pd.bodyData)
  169. en, err := enmime.ReadEnvelope(buffer)
  170. if err != nil {
  171. log.Printf("Unable to read mail body %s\n\nBody content: %s\n\n", err, pd.bodyData)
  172. return
  173. }
  174. pd.email.Body = &common.MailBody{}
  175. pd.email.Body.PlainText = en.Text
  176. pd.email.Body.RichText = en.HTML
  177. for _, attachment := range en.Attachments {
  178. uuid := uuid.New()
  179. fileName := hex.EncodeToString(uuid[:])
  180. attachmentFile, err := os.Create(fileName)
  181. log.Printf("Attachment found %s\n", fileName)
  182. if err != nil {
  183. continue
  184. }
  185. pd.email.Body.Attachments = append(pd.email.Body.Attachments, &common.AttachmentHeader{
  186. Id: fileName,
  187. FileName: attachment.FileName,
  188. ContentType: attachment.ContentType,
  189. })
  190. attachmentFile.Write(attachment.Content)
  191. }
  192. }