| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
- unit unitMetadata;
- {$mode objfpc}{$H+}
- interface
- uses
- Classes, SysUtils, FileUtil;
- // Extract basic metadata (title, authors, isbn) from a book file.
- // Supports PDF (via pdfinfo) and EPUB (via unzip and parsing the OPF file).
- // Returns True if any metadata was found.
- function ExtractBookMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
- implementation
- uses
- Process, DOM, XMLRead, LazUTF8, StrUtils, LazFileUtils, unitLog;
// Reduces S to a bare ISBN string.
//
// Every character that is not a decimal digit or the letter X is discarded,
// so prefixes such as "urn:isbn:" / "ISBN:" as well as hyphens, spaces and
// tabs disappear without any special handling.
//
// Returns:
//   - 13 digits for an ISBN-13 (no X allowed anywhere),
//   - 9 digits followed by a digit or an upper-case 'X' for an ISBN-10,
//   - '' for anything else.
//
// The check digit itself is NOT verified; only length and the position of
// 'X' are validated.
function NormalizeISBN(const S: String): String;
var
  i, xPos, xAllowedAt: Integer;
  acc: String;
begin
  Result := '';
  acc := '';
  for i := 1 to Length(S) do
    case S[i] of
      '0'..'9': acc := acc + S[i];
      // Always store the upper-case form; validity of its position is
      // decided below once the final length is known.
      'x', 'X': acc := acc + 'X';
    end;

  case Length(acc) of
    10: xAllowedAt := 10; // ISBN-10: X may only be the check digit
    13: xAllowedAt := 0;  // ISBN-13: purely numeric
  else
    Exit;
  end;

  // Pos returns the FIRST X, so "first X is in the allowed slot" also
  // guarantees there is no second one. This rejects free text that merely
  // happens to contain an 'x' between digits (e.g. "595.276 x 841.89").
  xPos := Pos('X', acc);
  if (xPos <> 0) and (xPos <> xAllowedAt) then
    Exit;

  Result := acc;
end;
// Extracts title, author(s) and ISBN from a PDF by running the external
// `pdfinfo` tool and scanning its "Key: value" output lines.
//
// FileName : path of the PDF, passed to pdfinfo as its only argument.
// Title    : text after the first "Title:" line, trimmed ('' if none).
// Authors  : text after the first "Author:" / "Authors:" line ('' if none).
// Isbn     : first normalisable ISBN found on a line that mentions "isbn".
// Result   : True when at least one of the three values is non-empty.
//
// Any exception (for example pdfinfo missing) is logged and turned into a
// False result; it is never propagated to the caller.
function ExtractPDFMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
const
  CHUNK_SIZE = 4096;
var
  proc: TProcess;
  sl, env: TStringList;
  captured: TMemoryStream;
  buf: array[0..CHUNK_SIZE - 1] of Byte;
  got: LongInt;
  line, exe: String;
  i, marker: Integer;
begin
  Result := False;
  Title := '';
  Authors := '';
  Isbn := '';
  exe := FindDefaultExecutablePath('pdfinfo');
  if exe = '' then exe := 'pdfinfo';
  LogInfoFmt('pdfinfo tool: %s', [exe]);
  proc := TProcess.Create(nil);
  sl := TStringList.Create;
  env := TStringList.Create;
  captured := TMemoryStream.Create;
  try
    try
      // Force English output regardless of user locale and preserve PATH
      env.Add('LC_ALL=C');
      env.Add('LANG=C');
      env.Add('PATH=' + GetEnvironmentVariable('PATH'));
      proc.Environment := env;
      proc.Executable := exe;
      proc.Parameters.Add(FileName);
      // No poWaitOnExit: waiting for exit before reading deadlocks as soon
      // as the child fills the pipe buffer. Drain stdout first, then wait.
      // NOTE(review): stderr is piped too and is not drained here; a child
      // that writes a very large amount to stderr could still stall.
      proc.Options := [poUsePipes];
      proc.ShowWindow := swoHide;
      LogDebugFmt('Running: %s %s', [proc.Executable, FileName]);
      proc.Execute;
      repeat
        got := proc.Output.Read(buf, SizeOf(buf));
        if got > 0 then
          captured.WriteBuffer(buf, got);
      until got <= 0;
      proc.WaitOnExit;
      captured.Position := 0;
      sl.LoadFromStream(captured);
      LogDebugFmt('pdfinfo exit=%d, output lines=%d', [proc.ExitStatus, sl.Count]);
      for i := 0 to sl.Count - 1 do
      begin
        line := sl[i];
        if (Title = '') and AnsiStartsStr('Title:', line) then
          Title := Trim(Copy(line, 7, MaxInt));
        if (Authors = '') and (AnsiStartsStr('Author:', line) or AnsiStartsStr('Authors:', line)) then
          Authors := Trim(Copy(line, Pos(':', line) + 1, MaxInt));
        if Isbn = '' then
        begin
          // Only lines that actually mention an ISBN are candidates.
          // Feeding every line to NormalizeISBN turns dates, file sizes and
          // page dimensions into bogus 10/13-character "ISBNs".
          marker := Pos('isbn', LowerCase(line));
          if marker > 0 then
            // Start at the marker so unrelated text before it cannot
            // contribute stray digits.
            Isbn := NormalizeISBN(Copy(line, marker, MaxInt));
        end;
      end;
      Result := (Title <> '') or (Authors <> '') or (Isbn <> '');
      LogInfoFmt('PDF metadata parsed: title="%s" authors="%s" isbn="%s" result=%s',
        [Title, Authors, Isbn, BoolToStr(Result, True)]);
    except
      on E: Exception do
      begin
        LogErrorFmt('pdfinfo failed: %s', [E.Message]);
        Result := False;
      end;
    end;
  finally
    captured.Free;
    sl.Free;
    env.Free;
    proc.Free;
  end;
end;
// Extracts title, author(s) and ISBN from an EPUB.
//
// Steps:
//   1. `unzip -Z1 <file>` lists the archive; the first entry whose extension
//      is .opf (case-insensitive) is taken as the package document.
//   2. `unzip -p <file> <opf>` streams that entry to stdout.
//   3. The OPF is parsed as XML and the children of its <metadata> element
//      are scanned for title / creator / identifier elements (with or
//      without the "dc:" prefix).
//
// Title   : first title element's text.
// Authors : all creator/author elements joined with ', '.
// Isbn    : first identifier that normalises to an ISBN (see IsIsbnCandidate).
// Result  : True when at least one of the three values is non-empty; False
//           when no .opf entry exists, unzip fails, or the OPF is not
//           parseable. Failures are logged, never raised.
function ExtractEPUBMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
var
  exe, opfPath, line, lname, idText, scheme: String;
  listing: TStringList;
  raw: TMemoryStream;
  xml: TXMLDocument;
  meta, node: TDOMNode;
  i, status: Integer;

  // Runs the unzip executable with Args, appends everything it writes to
  // stdout to Dest, and returns the process exit status.
  function RunUnzip(const Args: array of String; Dest: TStream): Integer;
  const
    CHUNK_SIZE = 4096;
  var
    proc: TProcess;
    env: TStringList;
    buf: array[0..CHUNK_SIZE - 1] of Byte;
    got: LongInt;
    k: Integer;
  begin
    proc := TProcess.Create(nil);
    env := TStringList.Create;
    try
      env.Add('LC_ALL=C');
      env.Add('LANG=C');
      env.Add('PATH=' + GetEnvironmentVariable('PATH'));
      proc.Environment := env;
      proc.Executable := exe;
      for k := Low(Args) to High(Args) do
        proc.Parameters.Add(Args[k]);
      // No poWaitOnExit: an OPF (or a long file listing) can be larger than
      // the pipe buffer, and waiting for exit before reading would then
      // block both processes forever. Drain stdout first, wait afterwards.
      // NOTE(review): stderr is piped as well and is not drained here.
      proc.Options := [poUsePipes];
      proc.ShowWindow := swoHide;
      proc.Execute;
      repeat
        got := proc.Output.Read(buf, SizeOf(buf));
        if got > 0 then
          Dest.WriteBuffer(buf, got);
      until got <= 0;
      proc.WaitOnExit;
      Result := proc.ExitStatus;
    finally
      env.Free;
      proc.Free;
    end;
  end;

  // Decides whether a <dc:identifier> may hold an ISBN.
  // Accepted: scheme declared as ISBN, or the text itself mentions "isbn".
  // Rejected: any other declared scheme, and scheme-less values that mention
  // "uuid" -- their digits could otherwise add up to a 10/13-character string.
  function IsIsbnCandidate(const Text, SchemeName: String): Boolean;
  begin
    if SameText(SchemeName, 'ISBN') or (Pos('isbn', LowerCase(Text)) > 0) then
      Exit(True);
    Result := (SchemeName = '') and (Pos('uuid', LowerCase(Text)) = 0);
  end;

begin
  Result := False;
  Title := '';
  Authors := '';
  Isbn := '';
  exe := FindDefaultExecutablePath('unzip');
  if exe = '' then exe := 'unzip';
  LogInfoFmt('unzip tool: %s', [exe]);

  raw := TMemoryStream.Create;
  try
    // list files
    opfPath := '';
    listing := TStringList.Create;
    try
      try
        LogDebugFmt('Running: %s -Z1 %s', [exe, FileName]);
        status := RunUnzip(['-Z1', FileName], raw);
        raw.Position := 0;
        listing.LoadFromStream(raw);
        LogDebugFmt('unzip -Z1 exit=%d, lines=%d', [status, listing.Count]);
        for i := 0 to listing.Count - 1 do
        begin
          line := Trim(listing[i]);
          if LowerCase(ExtractFileExt(line)) = '.opf' then
          begin
            opfPath := line;
            Break;
          end;
        end;
      except
        on E: Exception do
        begin
          LogErrorFmt('unzip -Z1 failed: %s', [E.Message]);
          opfPath := '';
        end;
      end;
    finally
      listing.Free;
    end;
    if opfPath = '' then Exit;

    // extract opf content
    raw.Clear;
    try
      LogDebugFmt('Running: %s -p %s %s', [exe, FileName, opfPath]);
      RunUnzip(['-p', FileName, opfPath], raw);
    except
      on E: Exception do
      begin
        LogErrorFmt('unzip -p failed: %s', [E.Message]);
        // Discard partial data; the XML parser then fails cleanly below.
        raw.Clear;
      end;
    end;
    raw.Position := 0;

    // xml starts as nil and is freed in the finally block on every path.
    // NOTE(review): depending on the FPC version, ReadXMLFile may hand back
    // a document object even when it raises -- freeing here covers that.
    xml := nil;
    try
      try
        ReadXMLFile(xml, raw);
      except
        on E: Exception do
        begin
          LogErrorFmt('ReadXML OPF failed: %s', [E.Message]);
          Exit(False);
        end;
      end;

      meta := nil;
      if (xml <> nil) and (xml.DocumentElement <> nil) then
      begin
        meta := xml.DocumentElement.FindNode('metadata');
        // Some packages bind the OPF namespace to an explicit prefix.
        if meta = nil then
          meta := xml.DocumentElement.FindNode('opf:metadata');
      end;

      if meta <> nil then
      begin
        node := meta.FirstChild;
        while node <> nil do
        begin
          lname := UTF8LowerCase(node.NodeName);
          if (Title = '') and ((lname = 'dc:title') or (lname = 'title')) then
            Title := UTF8Encode(Trim(node.TextContent));
          if (lname = 'dc:creator') or (lname = 'creator') or (lname = 'dc:author') or (lname = 'author') then
          begin
            if Authors <> '' then Authors := Authors + ', ';
            Authors := Authors + UTF8Encode(Trim(node.TextContent));
          end;
          if (Isbn = '') and ((lname = 'dc:identifier') or (lname = 'identifier')) then
          begin
            idText := UTF8Encode(Trim(node.TextContent));
            scheme := '';
            if node is TDOMElement then
              scheme := UTF8Encode(TDOMElement(node).GetAttribute('opf:scheme'));
            // opf:scheme names the KIND of identifier; it is used to pick
            // the right element, never parsed as the ISBN itself.
            if IsIsbnCandidate(idText, scheme) then
              Isbn := NormalizeISBN(idText);
          end;
          node := node.NextSibling;
        end;
      end;
    finally
      xml.Free;
    end;
  finally
    raw.Free;
  end;

  Result := (Title <> '') or (Authors <> '') or (Isbn <> '');
  LogInfoFmt('EPUB metadata parsed: title="%s" authors="%s" isbn="%s" result=%s',
    [Title, Authors, Isbn, BoolToStr(Result, True)]);
end;
// Public entry point: picks the extractor that matches the file extension
// (compared case-insensitively) and forwards the call to it.
//   .pdf  -> ExtractPDFMetadata
//   .epub -> ExtractEPUBMetadata
// Any other extension yields empty Title/Authors/Isbn and a False result.
function ExtractBookMetadata(const FileName: String; out Title, Authors, Isbn: String): Boolean;
var
  suffix: String;
begin
  // Read FileName before touching the out parameters.
  suffix := LowerCase(ExtractFileExt(FileName));

  case IndexStr(suffix, ['.pdf', '.epub']) of
    0: Result := ExtractPDFMetadata(FileName, Title, Authors, Isbn);
    1: Result := ExtractEPUBMetadata(FileName, Title, Authors, Isbn);
  else
    begin
      // Unsupported format: nothing to report.
      Title := '';
      Authors := '';
      Isbn := '';
      Result := False;
    end;
  end;
end;
- end.
|