tikiwiki/packages/tiki-pkg-mediaalchemyst/phpexiftool/exiftool/lib/Image/ExifTool/PostScript.pm

700 lines
26 KiB
Perl
Raw Permalink Normal View History

2023-11-20 21:52:04 +01:00
#------------------------------------------------------------------------------
# File: PostScript.pm
#
# Description: Read PostScript meta information
#
# Revisions: 07/08/2005 - P. Harvey Created
#
# References: 1) http://partners.adobe.com/public/developer/en/ps/5002.EPSF_Spec.pdf
# 2) http://partners.adobe.com/public/developer/en/ps/5001.DSC_Spec.pdf
# 3) http://partners.adobe.com/public/developer/en/illustrator/sdk/AI7FileFormat.pdf
#------------------------------------------------------------------------------
package Image::ExifTool::PostScript;
use strict;
use vars qw($VERSION $AUTOLOAD);
use Image::ExifTool qw(:DataAccess :Utils);
$VERSION = '1.41';
sub WritePS($$);
sub ProcessPS($$;$);
# PostScript tag table
%Image::ExifTool::PostScript::Main = (
PROCESS_PROC => \&ProcessPS,
WRITE_PROC => \&WritePS,
PREFERRED => 1, # always add these tags when writing
GROUPS => { 2 => 'Image' },
# Note: Make all of these tags priority 0 since the first one found at
# the start of the file should take priority (in case multiples exist)
Author => { Priority => 0, Groups => { 2 => 'Author' }, Writable => 'string' },
BoundingBox => { Priority => 0 },
Copyright => { Priority => 0, Writable => 'string' }, #2
CreationDate => {
Name => 'CreateDate',
Priority => 0,
Groups => { 2 => 'Time' },
Writable => 'string',
PrintConv => '$self->ConvertDateTime($val)',
PrintConvInv => '$self->InverseDateTime($val)',
},
Creator => { Priority => 0, Writable => 'string' },
ImageData => { Priority => 0 },
For => { Priority => 0, Writable => 'string', Notes => 'for whom the document was prepared'},
Keywords => { Priority => 0, Writable => 'string' },
ModDate => {
Name => 'ModifyDate',
Priority => 0,
Groups => { 2 => 'Time' },
Writable => 'string',
PrintConv => '$self->ConvertDateTime($val)',
PrintConvInv => '$self->InverseDateTime($val)',
},
Pages => { Priority => 0 },
Routing => { Priority => 0, Writable => 'string' }, #2
Subject => { Priority => 0, Writable => 'string' },
Title => { Priority => 0, Writable => 'string' },
Version => { Priority => 0, Writable => 'string' }, #2
# these subdirectories for documentation only
BeginPhotoshop => {
Name => 'PhotoshopData',
SubDirectory => {
TagTable => 'Image::ExifTool::Photoshop::Main',
},
},
BeginICCProfile => {
Name => 'ICC_Profile',
SubDirectory => {
TagTable => 'Image::ExifTool::ICC_Profile::Main',
},
},
begin_xml_packet => {
Name => 'XMP',
SubDirectory => {
TagTable => 'Image::ExifTool::XMP::Main',
},
},
TIFFPreview => {
Groups => { 2 => 'Preview' },
Binary => 1,
Notes => q{
not a real tag ID, but used to represent the TIFF preview extracted from DOS
EPS images
},
},
BeginDocument => {
Name => 'EmbeddedFile',
SubDirectory => {
TagTable => 'Image::ExifTool::PostScript::Main',
},
Notes => 'extracted with ExtractEmbedded option',
},
EmbeddedFileName => {
Notes => q{
not a real tag ID, but the file name from a BeginDocument statement.
Extracted with document metadata when ExtractEmbedded option is used
},
},
);
# composite tags
%Image::ExifTool::PostScript::Composite = (
GROUPS => { 2 => 'Image' },
# BoundingBox is in points, not pixels,
# but use it anyway if ImageData is not available
ImageWidth => {
Desire => {
0 => 'Main:PostScript:ImageData',
1 => 'PostScript:BoundingBox',
},
ValueConv => 'Image::ExifTool::PostScript::ImageSize(\@val, 0)',
},
ImageHeight => {
Desire => {
0 => 'Main:PostScript:ImageData',
1 => 'PostScript:BoundingBox',
},
ValueConv => 'Image::ExifTool::PostScript::ImageSize(\@val, 1)',
},
);
# add our composite tags
Image::ExifTool::AddCompositeTags('Image::ExifTool::PostScript');
#------------------------------------------------------------------------------
# AutoLoad our writer routines when necessary
#
sub AUTOLOAD
{
return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
}
#------------------------------------------------------------------------------
# Is this a PC system
# Returns: true for PC systems
my %isPC = (MSWin32 => 1, os2 => 1, dos => 1, NetWare => 1, symbian => 1, cygwin => 1);
sub IsPC()
{
return $isPC{$^O};
}
#------------------------------------------------------------------------------
# Get image width or height
# Inputs: 0) value list ref (ImageData, BoundingBox), 1) true to get height
sub ImageSize($$)
{
my ($vals, $getHeight) = @_;
my ($w, $h);
if ($$vals[0] and $$vals[0] =~ /^(\d+) (\d+)/) {
($w, $h) = ($1, $2);
} elsif ($$vals[1] and $$vals[1] =~ /^(\d+) (\d+) (\d+) (\d+)/) {
($w, $h) = ($3 - $1, $4 - $2);
}
return $getHeight ? $h : $w;
}
#------------------------------------------------------------------------------
# Set PostScript format error warning
# Inputs: 0) ExifTool object reference, 1) error string
# Returns: 1
sub PSErr($$)
{
my ($et, $str) = @_;
# set file type if not done already
my $ext = $$et{FILE_EXT};
$et->SetFileType(($ext and $ext eq 'AI') ? 'AI' : 'PS');
$et->Warn("PostScript format error ($str)");
return 1;
}
#------------------------------------------------------------------------------
# Return input record separator to use for the specified file
# Inputs: 0) RAF reference
# Returns: Input record separator or undef on error
sub GetInputRecordSeparator($)
{
my $raf = shift;
my $pos = $raf->Tell(); # save current position
my ($data, $sep);
$raf->Read($data,256) or return undef;
my ($a, $d) = (999,999);
$a = pos($data), pos($data) = 0 if $data =~ /\x0a/g;
$d = pos($data) if $data =~ /\x0d/g;
my $diff = $a - $d;
if ($diff eq 1) {
$sep = "\x0d\x0a";
} elsif ($diff eq -1) {
$sep = "\x0a\x0d";
} elsif ($diff > 0) {
$sep = "\x0d";
} elsif ($diff < 0) {
$sep = "\x0a";
} # else error
$raf->Seek($pos, 0); # restore original position
return $sep;
}
#------------------------------------------------------------------------------
# Decode comment from PostScript file
# Inputs: 0) comment string, 1) RAF ref, 2) reference to lines array
# 3) optional data reference for extra lines read from file
# Returns: Decoded comment string (may be an array reference)
# - handles multi-line comments and escape sequences
sub DecodeComment($$$;$)
{
my ($val, $raf, $lines, $dataPt) = @_;
$val =~ s/\x0d*\x0a*$//; # remove trailing CR, LF or CR/LF
# check for continuation comments
for (;;) {
unless (@$lines) {
my $buff;
$raf->ReadLine($buff) or last;
my $altnl = $/ eq "\x0d" ? "\x0a" : "\x0d";
if ($buff =~ /$altnl/) {
chomp $buff if $/ eq "\x0d\x0a"; # remove DOS newline before splitting
# split into separate lines
@$lines = split /$altnl/, $buff, -1;
# handle case of DOS newline data inside file using Unix newlines
@$lines = ( $$lines[0] . $$lines[1] ) if @$lines == 2 and $$lines[1] eq $/;
# add back trailing DOS newline if necessary
@$lines ? @$lines[-1] .= $/ : push @$lines, $/ if $/ eq "\x0d\x0a";
} else {
push @$lines, $buff;
}
}
last unless $$lines[0] =~ /^%%\+/; # is the next line a continuation?
$$dataPt .= $$lines[0] if $dataPt; # add to data if necessary
$$lines[0] =~ s/\x0d*\x0a*$//; # remove trailing CR, LF or CR/LF
$val .= substr(shift(@$lines), 3); # add to value (without leading "%%+")
}
my @vals;
# handle bracketed string values
if ($val =~ s/^\((.*)\)$/$1/) { # remove brackets if necessary
# split into an array of strings if necessary
my $nesting = 1;
while ($val =~ /(\(|\))/g) {
my $bra = $1;
my $pos = pos($val) - 2;
my $backslashes = 0;
while ($pos and substr($val, $pos, 1) eq '\\') {
--$pos;
++$backslashes;
}
next if $backslashes & 0x01; # escaped if odd number
if ($bra eq '(') {
++$nesting;
} else {
--$nesting;
unless ($nesting) {
push @vals, substr($val, 0, pos($val)-1);
$val = substr($val, pos($val));
++$nesting if $val =~ s/\s*\(//;
}
}
}
push @vals, $val;
foreach $val (@vals) {
# decode escape sequences in bracketed strings
# (similar to code in PDF.pm, but without line continuation)
while ($val =~ /\\(.)/sg) {
my $n = pos($val) - 2;
my $c = $1;
my $r;
if ($c =~ /[0-7]/) {
# get up to 2 more octal digits
$c .= $1 if $val =~ /\G([0-7]{1,2})/g;
# convert octal escape code
$r = chr(oct($c) & 0xff);
} else {
# convert escaped characters
($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
}
substr($val, $n, length($c)+1) = $r;
# continue search after this character
pos($val) = $n + length($r);
}
}
$val = @vals > 1 ? \@vals : $vals[0];
}
return $val;
}
#------------------------------------------------------------------------------
# Unescape PostScript string
# Inputs: 0) string
# Returns: unescaped string
sub UnescapePostScript($)
{
my $str = shift;
# decode escape sequences in literal strings
while ($str =~ /\\(.)/sg) {
my $n = pos($str) - 2;
my $c = $1;
my $r;
if ($c =~ /[0-7]/) {
# get up to 2 more octal digits
$c .= $1 if $str =~ /\G([0-7]{1,2})/g;
# convert octal escape code
$r = chr(oct($c) & 0xff);
} elsif ($c eq "\x0d") {
# the string is continued if the line ends with '\'
# (also remove "\x0d\x0a")
$c .= $1 if $str =~ /\G(\x0a)/g;
$r = '';
} elsif ($c eq "\x0a") {
$r = '';
} else {
# convert escaped characters
($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
}
substr($str, $n, length($c)+1) = $r;
# continue search after this character
pos($str) = $n + length($r);
}
return $str;
}
#------------------------------------------------------------------------------
# Extract information from EPS, PS or AI file
# Inputs: 0) ExifTool object reference, 1) dirInfo reference, 2) optional tag table ref
# Returns: 1 if this was a valid PostScript file
sub ProcessPS($$;$)
{
my ($et, $dirInfo, $tagTablePtr) = @_;
my $raf = $$dirInfo{RAF};
my $embedded = $et->Options('ExtractEmbedded');
my ($data, $dos, $endDoc, $fontTable, $comment);
# allow read from data
unless ($raf) {
$raf = new File::RandomAccess($$dirInfo{DataPt});
$et->VerboseDir('PostScript');
}
#
# determine if this is a postscript file
#
$raf->Read($data, 4) == 4 or return 0;
# accept either ASCII or DOS binary postscript file format
return 0 unless $data =~ /^(%!PS|%!Ad|%!Fo|\xc5\xd0\xd3\xc6)/;
if ($data =~ /^%!Ad/) {
# I've seen PS files start with "%!Adobe-PS"...
return 0 unless $raf->Read($data, 6) == 6 and $data eq "obe-PS";
} elsif ($data =~ /^\xc5\xd0\xd3\xc6/) {
# process DOS binary file header
# - save DOS header then seek ahead and check PS header
$raf->Read($dos, 26) == 26 or return 0;
SetByteOrder('II');
unless ($raf->Seek(Get32u(\$dos, 0), 0) and
$raf->Read($data, 4) == 4 and $data eq '%!PS')
{
return PSErr($et, 'invalid header');
}
} else {
# check for PostScript font file (PFA or PFB)
my $d2;
$data .= $d2 if $raf->Read($d2,12);
if ($data =~ /^%!(PS-(AdobeFont-|Bitstream )|FontType1-)/) {
$et->SetFileType('PFA'); # PostScript ASCII font file
$fontTable = GetTagTable('Image::ExifTool::Font::PSInfo');
# PostScript font files may contain an unformatted comments which may
# contain useful information, so accumulate these for the Comment tag
$comment = 1;
}
$raf->Seek(-length($data), 1);
}
#
# set the newline type based on the first newline found in the file
#
local $/ = GetInputRecordSeparator($raf);
$/ or return PSErr($et, 'invalid PS data');
# set file type (PostScript or EPS)
$raf->ReadLine($data) or $data = '';
my $type;
if ($data =~ /EPSF/) {
$type = 'EPS';
} else {
# read next line to see if this is an Illustrator file
my $line2;
my $pos = $raf->Tell();
if ($raf->ReadLine($line2) and $line2 =~ /^%%Creator: Adobe Illustrator/) {
$type = 'AI';
} else {
$type = 'PS';
}
$raf->Seek($pos, 0);
}
$et->SetFileType($type);
return 1 if $$et{OPTIONS}{FastScan} and $$et{OPTIONS}{FastScan} == 3;
#
# extract TIFF information from DOS header
#
$tagTablePtr or $tagTablePtr = GetTagTable('Image::ExifTool::PostScript::Main');
if ($dos) {
my $base = Get32u(\$dos, 16);
if ($base) {
my $pos = $raf->Tell();
# extract the TIFF preview
my $len = Get32u(\$dos, 20);
my $val = $et->ExtractBinary($base, $len, 'TIFFPreview');
if (defined $val and $val =~ /^(MM\0\x2a|II\x2a\0|Binary)/) {
$et->HandleTag($tagTablePtr, 'TIFFPreview', $val);
} else {
$et->Warn('Bad TIFF preview image');
}
# extract information from TIFF in DOS header
# (set Parent to '' to avoid setting FileType tag again)
my %dirInfo = (
Parent => '',
RAF => $raf,
Base => $base,
);
$et->ProcessTIFF(\%dirInfo) or $et->Warn('Bad embedded TIFF');
# position file pointer to extract PS information
$raf->Seek($pos, 0);
}
}
#
# parse the postscript
#
my ($buff, $mode, $beginToken, $endToken, $docNum, $subDocNum, $changedNL);
my (@lines, $altnl);
if ($/ eq "\x0d") {
$altnl = "\x0a";
} else {
$/ = "\x0a"; # end on any LF (even if DOS CR+LF)
$altnl = "\x0d";
}
for (;;) {
if (@lines) {
$data = shift @lines;
} else {
$raf->ReadLine($data) or last;
# check for alternate newlines as efficiently as possible
if ($data =~ /$altnl/) {
if (length($data) > 500000 and IsPC()) {
# Windows can't split very long lines due to poor memory handling,
# so re-read the file with the other newline character instead
# (slower but uses less memory)
unless ($changedNL) {
$changedNL = 1;
my $t = $/;
$/ = $altnl;
$altnl = $t;
$raf->Seek(-length($data), 1);
next;
}
} else {
# split into separate lines
@lines = split /$altnl/, $data, -1;
$data = shift @lines;
if (@lines == 1 and $lines[0] eq $/) {
# handle case of DOS newline data inside file using Unix newlines
$data .= $lines[0];
undef @lines;
}
}
}
}
undef $changedNL;
if ($mode) {
if (not $endToken) {
$buff .= $data;
next unless $data =~ m{<\?xpacket end=.(w|r).\?>($/|$)};
} elsif ($data !~ /^$endToken/i) {
if ($mode eq 'XMP') {
$buff .= $data;
} elsif ($mode eq 'Document') {
# ignore embedded documents, but keep track of nesting level
$docNum .= '-1' if $data =~ /^$beginToken/;
} else {
# data is ASCII-hex encoded
$data =~ tr/0-9A-Fa-f//dc; # remove all but hex characters
$buff .= pack('H*', $data); # translate from hex
}
next;
} elsif ($mode eq 'Document') {
$docNum =~ s/-?\d+$//; # decrement document nesting level
# done with Document mode if we are back at the top level
undef $mode unless $docNum;
next;
}
} elsif ($endDoc and $data =~ /^$endDoc/i) {
$docNum =~ s/-?(\d+)$//; # decrement nesting level
$subDocNum = $1; # remember our last sub-document number
$$et{DOC_NUM} = $docNum;
undef $endDoc unless $docNum; # done with document if top level
next;
} elsif ($data =~ /^(%{1,2})(Begin)(_xml_packet|Photoshop|ICCProfile|Document|Binary)/i) {
# the beginning of a data block
my %modeLookup = (
_xml_packet => 'XMP',
photoshop => 'Photoshop',
iccprofile => 'ICC_Profile',
document => 'Document',
binary => undef, # (we will try to skip this)
);
$mode = $modeLookup{lc $3};
unless ($mode) {
if (not @lines and $data =~ /^%{1,2}BeginBinary:\s*(\d+)/i) {
$raf->Seek($1, 1) or last; # skip binary data
}
next;
}
$buff = '';
$beginToken = $1 . $2 . $3;
$endToken = $1 . ($2 eq 'begin' ? 'end' : 'End') . $3;
if ($mode eq 'Document') {
# this is either the 1st sub-document or Nth document
if ($docNum) {
# increase nesting level
$docNum .= '-' . (++$subDocNum);
} else {
# this is the Nth document
$docNum = $$et{DOC_COUNT} + 1;
}
$subDocNum = 0; # new level, so reset subDocNum
next unless $embedded; # skip over this document
# set document number for family 4-7 group names
$$et{DOC_NUM} = $docNum;
$$et{LIST_TAGS} = { }; # don't build lists across different documents
$$et{PROCESSED} = { }; # re-initialize processed directory lookup too
$endDoc = $endToken; # parse to EndDocument token
# reset mode to allow parsing into sub-directories
undef $endToken;
undef $mode;
# save document name if available
if ($data =~ /^$beginToken:\s+([^\n\r]+)/i) {
my $docName = $1;
# remove brackets if necessary
$docName = $1 if $docName =~ /^\((.*)\)$/;
$et->HandleTag($tagTablePtr, 'EmbeddedFileName', $docName);
}
}
next;
} elsif ($data =~ /^<\?xpacket begin=.{7,13}W5M0MpCehiHzreSzNTczkc9d/) {
# pick up any stray XMP data
$mode = 'XMP';
$buff = $data;
undef $endToken; # no end token (just look for xpacket end)
# XMP could be contained in a single line (if newlines are different)
next unless $data =~ m{<\?xpacket end=.(w|r).\?>($/|$)};
} elsif ($data =~ /^%%?(\w+): ?(.*)/s and $$tagTablePtr{$1}) {
my ($tag, $val) = ($1, $2);
# only allow 'ImageData' to have single leading '%'
next unless $data =~ /^%%/ or $1 eq 'ImageData';
# decode comment string (reading continuation lines if necessary)
$val = DecodeComment($val, $raf, \@lines);
$et->HandleTag($tagTablePtr, $tag, $val);
next;
} elsif ($embedded and $data =~ /^%AI12_CompressedData/) {
# the rest of the file is compressed
unless (eval { require Compress::Zlib }) {
$et->Warn('Install Compress::Zlib to extract compressed embedded data');
last;
}
# seek back to find the start of the compressed data in the file
my $tlen = length($data) + @lines;
$tlen += length $_ foreach @lines;
my $backTo = $raf->Tell() - $tlen - 64;
$backTo = 0 if $backTo < 0;
last unless $raf->Seek($backTo, 0) and $raf->Read($data, 2048);
last unless $data =~ s/.*?%AI12_CompressedData//;
my $inflate = Compress::Zlib::inflateInit();
$inflate or $et->Warn('Error initializing inflate'), last;
# generate a PS-like file in memory from the compressed data
my $verbose = $et->Options('Verbose');
if ($verbose > 1) {
$et->VerboseDir('AI12_CompressedData (first 4kB)');
$et->VerboseDump(\$data);
}
# remove header if it exists (Windows AI files only)
$data =~ s/^.{0,256}EndData[\x0d\x0a]+//s;
my $val;
for (;;) {
my ($v2, $stat) = $inflate->inflate($data);
$stat == Compress::Zlib::Z_STREAM_END() and $val .= $v2, last;
$stat != Compress::Zlib::Z_OK() and undef($val), last;
if (defined $val) {
$val .= $v2;
} elsif ($v2 =~ /^%!PS/) {
$val = $v2;
} else {
# add postscript header (for file recognition) if it doesn't exist
$val = "%!PS-Adobe-3.0$/" . $v2;
}
$raf->Read($data, 65536) or last;
}
defined $val or $et->Warn('Error inflating AI compressed data'), last;
if ($verbose > 1) {
$et->VerboseDir('Uncompressed AI12 Data');
$et->VerboseDump(\$val);
}
# extract information from embedded images in the uncompressed data
$val = # add PS header in case it needs one
ProcessPS($et, { DataPt => \$val });
last;
} elsif ($fontTable) {
if (defined $comment) {
# extract initial comments from PostScript Font files
if ($data =~ /^%\s+(.*?)[\x0d\x0a]/) {
$comment .= "\n" if $comment;
$comment .= $1;
next;
} elsif ($data !~ /^%/) {
# stop extracting comments at the first non-comment line
$et->FoundTag('Comment', $comment) if length $comment;
undef $comment;
}
}
if ($data =~ m{^\s*/(\w+)\s*(.*)} and $$fontTable{$1}) {
my ($tag, $val) = ($1, $2);
if ($val =~ /^\((.*)\)/) {
$val = UnescapePostScript($1);
} elsif ($val =~ m{/?(\S+)}) {
$val = $1;
}
$et->HandleTag($fontTable, $tag, $val);
} elsif ($data =~ /^currentdict end/) {
# only extract tags from initial FontInfo dict
undef $fontTable;
}
next;
} else {
next;
}
# extract information from buffered data
my %dirInfo = (
DataPt => \$buff,
DataLen => length $buff,
DirStart => 0,
DirLen => length $buff,
Parent => 'PostScript',
);
my $subTablePtr = GetTagTable("Image::ExifTool::${mode}::Main");
unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
$et->Warn("Error processing $mode information in PostScript file");
}
undef $buff;
undef $mode;
}
$mode = 'Document' if $endDoc and not $mode;
$mode and PSErr($et, "unterminated $mode data");
return 1;
}
#------------------------------------------------------------------------------
# Extract information from EPS file
# Inputs: 0) ExifTool object reference, 1) dirInfo reference
# Returns: 1 if this was a valid PostScript file
sub ProcessEPS($$)
{
return ProcessPS($_[0],$_[1]);
}
1; # end
__END__
=head1 NAME
Image::ExifTool::PostScript - Read PostScript meta information
=head1 SYNOPSIS
This module is loaded automatically by Image::ExifTool when required.
=head1 DESCRIPTION
This code reads meta information from EPS (Encapsulated PostScript), PS
(PostScript) and AI (Adobe Illustrator) files.
=head1 AUTHOR
Copyright 2003-2016, Phil Harvey (phil at owl.phy.queensu.ca)
This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=head1 REFERENCES
=over 4
=item L<http://partners.adobe.com/public/developer/en/ps/5002.EPSF_Spec.pdf>
=item L<http://partners.adobe.com/public/developer/en/ps/5001.DSC_Spec.pdf>
=item L<http://partners.adobe.com/public/developer/en/illustrator/sdk/AI7FileFormat.pdf>
=back
=head1 SEE ALSO
L<Image::ExifTool::TagNames/PostScript Tags>,
L<Image::ExifTool(3pm)|Image::ExifTool>
=cut