unitmetadata.pas 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. unit unitMetadata;
  2. {$mode objfpc}{$H+}
  3. interface
  4. uses
  5. Classes, SysUtils, FileUtil;
  6. // Extract basic metadata (title, authors, isbn) from a book file.
  7. // Supports PDF (via pdfinfo) and EPUB (via unzip and parsing the OPF file).
  8. // Returns True if any metadata was found.
  9. function ExtractBookMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
  10. implementation
  11. uses
  12. Process, DOM, XMLRead, LazUTF8, StrUtils, LazFileUtils, unitLog;
  { Reduces a free-form ISBN candidate to its bare 10- or 13-character form.

    Steps:
      - removes the first 'urn:isbn:' prefix, an 'ISBN-10' / 'ISBN-13' label
        (so the label's own digits are not counted), and the first 'isbn:' /
        'isbn' label, all case-insensitively;
      - keeps decimal digits and the letter X (stored upper-case); every other
        character (hyphens, blanks, tabs, punctuation, letters) is dropped;
      - accepts the result only when it is 10 or 13 characters long and an 'X'
        appears nowhere except as the last character of a 10-character value.

    Returns the normalized ISBN, or '' when S does not yield one.
    The check digit itself is NOT verified. }
  function NormalizeISBN(const S: String): String;
  var
    i, labelPos: Integer;
    ch: Char;
    acc, src: String;
  begin
    Result := '';

    // strip common prefixes (only the first occurrence of each)
    src := StringReplace(S, 'urn:isbn:', '', [rfIgnoreCase]);

    // "ISBN-10" / "ISBN-13" labels: remove the whole label so that its "10" or
    // "13" does not end up in the digit stream. The label is only recognised
    // when it is not directly followed by another digit, so an input such as
    // "isbn-1034567890" is still read as a hyphen followed by the number.
    labelPos := Pos('isbn-1', LowerCase(src));
    if (labelPos > 0) and (labelPos + 6 <= Length(src))
       and (src[labelPos + 6] in ['0', '3'])
       and ((labelPos + 7 > Length(src)) or not (src[labelPos + 7] in ['0'..'9'])) then
      Delete(src, labelPos, 7);

    src := StringReplace(src, 'isbn:', '', [rfIgnoreCase]);
    src := StringReplace(src, 'isbn', '', [rfIgnoreCase]);

    // collect the significant characters
    acc := '';
    for i := 1 to Length(src) do
    begin
      ch := src[i];
      case ch of
        '0'..'9': acc := acc + ch;
        'x', 'X': acc := acc + 'X';
      end;
    end;

    if (Length(acc) <> 10) and (Length(acc) <> 13) then
      Exit;

    // 'X' is only meaningful as the final check character of an ISBN-10
    for i := 1 to Length(acc) do
      if (acc[i] = 'X') and not ((Length(acc) = 10) and (i = 10)) then
        Exit;

    Result := acc;
  end;
  { Returns the ISBN-looking text that follows the first "ISBN" label in Line,
    or '' when Line contains no such label.

    After the label an optional "-10" / "-13" variant suffix and any ':', '=',
    blank or tab separators are skipped; the result is the following run of
    digits, 'x'/'X', hyphens and blanks (trimmed). }
  function IsbnTextAfterLabel(const Line: String): String;
  var
    startPos, endPos: Integer;
  begin
    Result := '';
    startPos := Pos('isbn', LowerCase(Line));
    if startPos = 0 then
      Exit;
    Inc(startPos, 4);

    // "ISBN-10" / "ISBN-13": skip the variant suffix unless more digits follow
    // directly (in which case the hyphen is just a separator before the number)
    if (startPos + 2 <= Length(Line)) and (Line[startPos] = '-')
       and (Line[startPos + 1] = '1') and (Line[startPos + 2] in ['0', '3'])
       and ((startPos + 3 > Length(Line)) or not (Line[startPos + 3] in ['0'..'9'])) then
      Inc(startPos, 3);

    while (startPos <= Length(Line)) and (Line[startPos] in [':', '=', ' ', #9]) do
      Inc(startPos);

    endPos := startPos;
    while (endPos <= Length(Line)) and (Line[endPos] in ['0'..'9', 'x', 'X', '-', ' ']) do
      Inc(endPos);

    Result := Trim(Copy(Line, startPos, endPos - startPos));
  end;

  { Extracts title, authors and ISBN from a PDF by running the external
    `pdfinfo` tool on FileName and parsing its standard output.

    - Title:   first line starting with 'Title:'.
    - Authors: first line starting with 'Author:' or 'Authors:'.
    - Isbn:    first line that carries an "ISBN" label followed by something
               NormalizeISBN accepts. Lines without the label are not scanned,
               so dates, file sizes and version strings cannot be mistaken for
               an ISBN.

    All three outputs are reset to '' first. Returns True when at least one of
    them was filled. Any exception (for example the tool cannot be started) is
    logged and results in False. The tool's exit status is only logged. }
  function ExtractPDFMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
  var
    proc: TProcess;
    sl, env: TStringList;
    captured: TMemoryStream;
    buf: array[0..4095] of Byte;
    bytesRead: LongInt;
    line, exe: String;
    i: Integer;
  begin
    Result := False;
    Title := '';
    Authors := '';
    Isbn := '';

    exe := FindDefaultExecutablePath('pdfinfo');
    if exe = '' then exe := 'pdfinfo';
    LogInfoFmt('pdfinfo tool: %s', [exe]);

    proc := TProcess.Create(nil);
    sl := TStringList.Create;
    env := TStringList.Create;
    captured := TMemoryStream.Create;
    try
      try
        // Force English output regardless of user locale and preserve PATH
        env.Add('LC_ALL=C');
        env.Add('LANG=C');
        env.Add('PATH=' + GetEnvironmentVariable('PATH'));
        proc.Environment := env;
        proc.Executable := exe;
        proc.Parameters.Add(FileName);
        // No poWaitOnExit: waiting for the child to exit before reading its
        // pipe can block forever once the child has filled the pipe buffer.
        proc.Options := [poUsePipes];
        proc.ShowWindow := swoHide;
        LogDebugFmt('Running: %s %s', [proc.Executable, FileName]);
        proc.Execute;

        // Drain stdout until the child closes it, then collect the exit status.
        repeat
          bytesRead := proc.Output.Read(buf, SizeOf(buf));
          if bytesRead > 0 then
            captured.WriteBuffer(buf, bytesRead);
        until bytesRead <= 0;
        proc.WaitOnExit;

        captured.Position := 0;
        sl.LoadFromStream(captured);
        LogDebugFmt('pdfinfo exit=%d, output lines=%d', [proc.ExitStatus, sl.Count]);

        for i := 0 to sl.Count - 1 do
        begin
          line := sl[i];
          if (Title = '') and AnsiStartsStr('Title:', line) then
            Title := Trim(Copy(line, 7, MaxInt));
          if (Authors = '') and (AnsiStartsStr('Author:', line) or AnsiStartsStr('Authors:', line)) then
            Authors := Trim(Copy(line, Pos(':', line) + 1, MaxInt));
          // Only text that follows an explicit ISBN label is considered.
          if Isbn = '' then
            Isbn := NormalizeISBN(IsbnTextAfterLabel(line));
        end;

        Result := (Title <> '') or (Authors <> '') or (Isbn <> '');
        LogInfoFmt('PDF metadata parsed: title="%s" authors="%s" isbn="%s" result=%s',
          [Title, Authors, Isbn, BoolToStr(Result, True)]);
      except
        on E: Exception do
        begin
          LogErrorFmt('pdfinfo failed: %s', [E.Message]);
          Result := False;
        end;
      end;
    finally
      captured.Free;
      sl.Free;
      env.Free;
      proc.Free;
    end;
  end;
  { Runs Exe with Args and appends everything the child writes to its standard
    output to Dest. ExitStatus receives the child's exit status (-1 if it was
    never obtained).

    The child gets a minimal environment (LC_ALL=C, LANG=C and the current
    PATH). Output is drained while the child runs; waiting for exit before
    reading could block forever once the pipe buffer is full.
    Exceptions raised while starting or reading the process propagate to the
    caller; the process and environment objects are always released. }
  procedure RunUnzipCapture(const Exe: String; const Args: array of String;
    Dest: TStream; out ExitStatus: Integer);
  var
    proc: TProcess;
    env: TStringList;
    buf: array[0..4095] of Byte;
    bytesRead: LongInt;
    k: Integer;
  begin
    ExitStatus := -1;
    proc := TProcess.Create(nil);
    env := TStringList.Create;
    try
      env.Add('LC_ALL=C');
      env.Add('LANG=C');
      env.Add('PATH=' + GetEnvironmentVariable('PATH'));
      proc.Environment := env;
      proc.Executable := Exe;
      for k := Low(Args) to High(Args) do
        proc.Parameters.Add(Args[k]);
      proc.Options := [poUsePipes];
      proc.ShowWindow := swoHide;
      proc.Execute;
      repeat
        bytesRead := proc.Output.Read(buf, SizeOf(buf));
        if bytesRead > 0 then
          Dest.WriteBuffer(buf, bytesRead);
      until bytesRead <= 0;
      proc.WaitOnExit;
      ExitStatus := proc.ExitStatus;
    finally
      env.Free;
      proc.Free;
    end;
  end;

  { Returns the direct child of Root whose (lower-cased) node name is
    'metadata', with or without a namespace prefix (e.g. 'opf:metadata').
    Returns nil when Root is nil or has no such child. }
  function FindOpfMetadataNode(Root: TDOMNode): TDOMNode;
  var
    nm: String;
  begin
    Result := nil;
    if Root = nil then
      Exit;
    Result := Root.FirstChild;
    while Result <> nil do
    begin
      nm := UTF8LowerCase(Result.NodeName);
      if (nm = 'metadata') or AnsiEndsStr(':metadata', nm) then
        Exit;
      Result := Result.NextSibling;
    end;
  end;

  { Decides whether an OPF identifier should be parsed as an ISBN.
    True when the scheme or the value mentions "isbn" (case-insensitive), or
    when no scheme is given and the value consists solely of digits, 'x'/'X',
    hyphens and blanks. Identifiers such as UUIDs, DOIs or URLs are rejected so
    their stray digits are not reported as an ISBN. }
  function IdentifierLooksLikeIsbn(const Scheme, Value: String): Boolean;
  var
    k: Integer;
  begin
    if (Pos('isbn', LowerCase(Scheme)) > 0) or (Pos('isbn', LowerCase(Value)) > 0) then
      Exit(True);
    Result := (Scheme = '') and (Value <> '');
    if Result then
      for k := 1 to Length(Value) do
        if not (Value[k] in ['0'..'9', 'x', 'X', '-', ' ']) then
          Exit(False);
  end;

  { Extracts title, authors and ISBN from an EPUB file.

    1. `unzip -Z1` lists the archive; the first entry with an '.opf' extension
       is taken as the package document.
    2. `unzip -p` streams that entry, which is parsed as XML.
    3. The children of the package's metadata element are scanned:
       - first dc:title / title            -> Title
       - every dc:creator / creator / dc:author / author -> Authors (", "-joined)
       - first dc:identifier / identifier that looks like an ISBN and that
         NormalizeISBN accepts -> Isbn

    All three outputs are reset to '' first. Returns True when at least one of
    them was filled; False when no OPF entry was found, a tool failed, or the
    OPF could not be parsed (failures are logged). }
  function ExtractEPUBMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
  var
    listing: TStringList;
    listRaw, opfData: TMemoryStream;
    exe, opfPath, entry, lname, idValue, idScheme: String;
    xml: TXMLDocument;
    meta, node: TDOMNode;
    i, status: Integer;
  begin
    Result := False;
    Title := '';
    Authors := '';
    Isbn := '';

    exe := FindDefaultExecutablePath('unzip');
    if exe = '' then exe := 'unzip';
    LogInfoFmt('unzip tool: %s', [exe]);

    // list files
    opfPath := '';
    listRaw := TMemoryStream.Create;
    listing := TStringList.Create;
    try
      try
        LogDebugFmt('Running: %s -Z1 %s', [exe, FileName]);
        RunUnzipCapture(exe, ['-Z1', FileName], listRaw, status);
        listRaw.Position := 0;
        listing.LoadFromStream(listRaw);
        LogDebugFmt('unzip -Z1 exit=%d, lines=%d', [status, listing.Count]);
        for i := 0 to listing.Count - 1 do
        begin
          entry := Trim(listing[i]);
          if LowerCase(ExtractFileExt(entry)) = '.opf' then
          begin
            opfPath := entry;
            Break;
          end;
        end;
      except
        on E: Exception do
        begin
          LogErrorFmt('unzip -Z1 failed: %s', [E.Message]);
          opfPath := '';
        end;
      end;
    finally
      listing.Free;
      listRaw.Free;
    end;
    if opfPath = '' then Exit;

    // extract opf content and parse it
    xml := nil;
    opfData := TMemoryStream.Create;
    try
      try
        LogDebugFmt('Running: %s -p %s %s', [exe, FileName, opfPath]);
        RunUnzipCapture(exe, ['-p', FileName, opfPath], opfData, status);
        opfData.Position := 0;
      except
        on E: Exception do
        begin
          LogErrorFmt('unzip -p failed: %s', [E.Message]);
          // leave an empty stream; the XML parser below reports the failure
          opfData.Size := 0;
        end;
      end;

      try
        ReadXMLFile(xml, opfData);
      except
        on E: Exception do
        begin
          LogErrorFmt('ReadXML OPF failed: %s', [E.Message]);
          Exit(False);
        end;
      end;

      if xml <> nil then
        meta := FindOpfMetadataNode(xml.DocumentElement)
      else
        meta := nil;

      if meta <> nil then
      begin
        node := meta.FirstChild;
        while node <> nil do
        begin
          lname := UTF8LowerCase(node.NodeName);

          if (Title = '') and ((lname = 'dc:title') or (lname = 'title')) then
            Title := UTF8Encode(Trim(node.TextContent));

          if (lname = 'dc:creator') or (lname = 'creator')
             or (lname = 'dc:author') or (lname = 'author') then
          begin
            if Authors <> '' then Authors := Authors + ', ';
            Authors := Authors + UTF8Encode(Trim(node.TextContent));
          end;

          if (Isbn = '') and ((lname = 'dc:identifier') or (lname = 'identifier')) then
          begin
            idValue := UTF8Encode(Trim(node.TextContent));
            idScheme := '';
            if node is TDOMElement then
              idScheme := UTF8Encode(TDOMElement(node).GetAttribute('opf:scheme'));
            // The scheme only tells us WHAT the identifier is; the number
            // itself always comes from the element text.
            if IdentifierLooksLikeIsbn(idScheme, idValue) then
              Isbn := NormalizeISBN(idValue);
          end;

          node := node.NextSibling;
        end;
      end;
    finally
      // xml is nil-initialised, so this is safe on every path, including a
      // parse failure that leaves a partially built document behind.
      xml.Free;
      opfData.Free;
    end;

    Result := (Title <> '') or (Authors <> '') or (Isbn <> '');
    LogInfoFmt('EPUB metadata parsed: title="%s" authors="%s" isbn="%s" result=%s',
      [Title, Authors, Isbn, BoolToStr(Result, True)]);
  end;
  { Public entry point: picks the extractor that matches the file extension of
    FileName (compared case-insensitively).

      '.pdf'  -> ExtractPDFMetadata
      '.epub' -> ExtractEPUBMetadata

    For any other extension Title, Authors and Isbn are set to '' and the
    function returns False. Otherwise the chosen extractor's result is returned
    unchanged. }
  function ExtractBookMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
  var
    suffix: String;
  begin
    suffix := LowerCase(ExtractFileExt(FileName));

    // Unsupported format: report "nothing found" with empty outputs.
    if (suffix <> '.pdf') and (suffix <> '.epub') then
    begin
      Title := '';
      Authors := '';
      Isbn := '';
      Exit(False);
    end;

    if suffix = '.pdf' then
      Exit(ExtractPDFMetadata(FileName, Title, Authors, Isbn));

    Result := ExtractEPUBMetadata(FileName, Title, Authors, Isbn);
  end;
  244. end.