// Copyright 1997-2000 Omni Development, Inc.  All rights reserved.
//
// This software may only be used and reproduced according to the
// terms in the file OmniSourceLicense.html, which should be
// distributed with this project and can also be found at
// http://www.omnigroup.com/DeveloperResources/OmniSourceLicense.html.

#define DEBUG

#import "OWHTMLToSGMLObjects.h"

#import <Foundation/Foundation.h>
#import <OmniBase/OmniBase.h>
#import <OmniFoundation/OmniFoundation.h>

#import "NSString-OWSGMLString.h"
#import "OWSGMLTag.h"
#import "OWSGMLTagType.h"
#import "OWSGMLAttribute.h"
#import "OWSGMLDTD.h"
#import "OWObjectStream.h"
#import "OWDataStream.h"
#import "OWDataStreamCharacterCursor.h"
#import "OWDataStreamScanner.h"
#import "OWParameterizedContentType.h"
#import "OWPipeline.h"

RCS_ID("$Header: /Network/Source/CVS/OmniGroup/Frameworks/OWF/Processors.subproj/SGML.subproj/OWHTMLToSGMLObjects.m,v 1.42 2000/12/20 23:02:02 wiml Exp $")

// HTML-specific scanning primitives added to OWDataStreamScanner. Only the declarations are visible in this part of the file.
@interface OWDataStreamScanner (OWHTMLScanning)
// Called by -scanContent for runs of ordinary text. Judging by the name it stops at the next '<' or '&' -- confirm against the implementation.
- (NSString *)readFragmentUpToLeftAngleBracketOrAmpersand;
// Called by -scanNonSGMLContent:interpretEntities: with the lowercase and uppercase spellings of an end tag (both terminatorLength characters long). The caller treats *terminatorSeen as the number of terminator characters matched so far.
- (NSString *)readFragmentUpThroughCharacters:(unichar *)terminator alternative:(unichar *)alterminator length:(int)terminatorLength seen:(int *)terminatorSeen;
@end

// Internal tokenizer methods. The scan* and skip* methods return nothing: they consume input from the scanner, and several of them write tokens to objectStream. The read* methods return what they parsed to the caller.
@interface OWHTMLToSGMLObjects (Private)
// Top-level loop: dispatches on '<' and '&', passes everything else through as text.
- (void)scanContent;
// Records the data stream cursor's string encoding in the pipeline context.
- (void)propagateStringEncoding;
// Called just after a '<' has been consumed.
- (void)scanTag;
- (void)scanBeginTag;
// Attribute value helpers: the first returns the value with entities expanded, the second only consumes it.
- (NSString *)readValueWithDelimiterCSBitmap:(CSBitmap)delimiterCSBitmap newlinesAreDelimiters:(BOOL)newlinesAreDelimiters;
- (void)skipValueWithDelimiterCSBitmap:(CSBitmap)delimiterCSBitmap newlinesAreDelimiters:(BOOL)newlinesAreDelimiters;
- (void)scanEndTag;
- (void)scanMarkupDeclaration;
- (void)scanComment;
- (void)scanProcessingInstruction;
// Entity helpers, called just after a '&' has been consumed.
- (id <OWSGMLToken>)readEntity;
- (id <OWSGMLToken>)readCharacterReference;
- (id <OWSGMLToken>)readEntityReference;
- (unsigned int)readNumber;
- (unsigned int)readHexNumber;
- (void)skipToEndOfTag;
- (void)scanNonSGMLContent:(OWSGMLTag *)nonSGMLTag interpretEntities:(BOOL)shouldInterpretEntities;
- (void)metaCharsetTagHack:(OWSGMLTag *)tag;
@end

@implementation OWHTMLToSGMLObjects

// All of the state below is shared by every instance and is populated once, in +initialize.

// static Class stringDecoderClass;

// Created in +initialize. Not static, so it is visible to other files; nothing in the visible part of this file locks it.
NSLock *decoderDefaultsLock = nil;
 
// Contents of the bundle's entities.plist.
static NSDictionary *entityDictionary;
// Entity name -> replacement string (the plist's "strings" entries plus one single-character string per "character" entry).
static NSMutableDictionary *stringEntityDictionary;
// Reverse map: single-character replacement string -> entity name.
static NSDictionary *entityNameDictionary;

// abstract syntax

static NSCharacterSet *DigitSet;
static NSCharacterSet *InvertedDigitSet;
static NSMutableCharacterSet *InvertedHexDigitSet;
static NSCharacterSet *LCLetterSet;
static NSCharacterSet *UCLetterSet;
static NSCharacterSet *SpecialSet;

// concrete syntax

static NSCharacterSet *LCNameCharSet;
static NSCharacterSet *RecordEndSet;
static NSCharacterSet *RecordStartSet;
static NSCharacterSet *SepCharSet;
static NSCharacterSet *SpaceSet;
static NSCharacterSet *UCNameCharSet;

// categories

static NSMutableCharacterSet *NameStartCharacterSet;
static NSMutableCharacterSet *InvertedNameCharacterSet;
static NSMutableCharacterSet *BlankSpaceSet;
static NSCharacterSet *InvertedBlankSpaceSet;
static NSMutableCharacterSet *CREFSet;

// made up

static NSCharacterSet *CommentEndSet;
static NSCharacterSet *ContentEndSet;
static NSCharacterSet *EndQuotedValueSet;
static NSCharacterSet *EndSingleQuotedValueSet;
static NSCharacterSet *EndTagSet;
static NSMutableCharacterSet *EndValueSet;
static NSMutableCharacterSet *TagEndOrNameStartCharacterSet;

// bitmaps
// CSBitmap forms of the character sets above; these are what the scanning code actually tests characters against.

static CSBitmap CREFCSBitmap;
static CSBitmap CommentEndCSBitmap;
static CSBitmap DigitCSBitmap;
static CSBitmap EndQuotedValueCSBitmap;
static CSBitmap EndSingleQuotedValueCSBitmap;
static CSBitmap EndTagCSBitmap;
static CSBitmap EndValueCSBitmap;
static CSBitmap InvertedBlankSpaceCSBitmap;
static CSBitmap InvertedDigitCSBitmap;
static CSBitmap InvertedHexDigitCSBitmap;
static CSBitmap InvertedNameCSBitmap;
static CSBitmap NameStartCSBitmap;
static CSBitmap TagEndOrNameStartCSBitmap;


// One-time class setup: loads the entity tables from entities.plist and builds the character sets (and their CSBitmap forms) used by the tokenizer.
+ (void)initialize;
{
    static BOOL initialized = NO;
    NSDictionary *characterDictionary;
    NSEnumerator *characterKeyEnumerator;
    NSString *name, *value;
    NSAutoreleasePool *pool;
    NSMutableDictionary *inverseEntities;

    [super initialize];
    if (initialized)
        return;
    initialized = YES;

    pool = [[NSAutoreleasePool alloc] init];

    entityDictionary = [[NSDictionary alloc] initWithContentsOfFile:[[NSBundle bundleForClass:self] pathForResource:@"entities" ofType:@"plist"]];
    stringEntityDictionary = [[entityDictionary objectForKey:@"strings"] mutableCopy];
    // If the plist is missing or has no "strings" section, start from an empty table so the "character" entries below are still registered instead of being sent to nil.
    if (stringEntityDictionary == nil)
        stringEntityDictionary = [[NSMutableDictionary alloc] init];

    // Each "character" entry maps an entity name to a numeric character value; store it as a one-character string.
    characterDictionary = [entityDictionary objectForKey:@"character"];
    characterKeyEnumerator = [characterDictionary keyEnumerator];
    while ((name = [characterKeyEnumerator nextObject])) {
        unichar character;

        value = [characterDictionary objectForKey:name];
        character = [value intValue];
        value = [NSString stringWithCharacters:&character length:1];
        [stringEntityDictionary setObject:value forKey:name];
    }

    // Build the reverse map (single character -> entity name) used by +entityNameForCharacter:.
    inverseEntities = [[NSMutableDictionary alloc] initWithCapacity:[stringEntityDictionary count]];
    characterKeyEnumerator = [stringEntityDictionary keyEnumerator];
    while ((name = [characterKeyEnumerator nextObject]) != nil) {
        value = [stringEntityDictionary objectForKey:name];
        if ([value length] == 1)
            [inverseEntities setObject:name forKey:value];
    }
    entityNameDictionary = [inverseEntities copy];
    [inverseEntities release];

    if (!decoderDefaultsLock)
        decoderDefaultsLock = [[NSLock alloc] init];

// abstract syntax

    DigitSet = [[NSCharacterSet decimalDigitCharacterSet] retain];
    InvertedDigitSet = [[DigitSet invertedSet] retain];

    InvertedHexDigitSet = [[NSMutableCharacterSet alloc] init];
    [InvertedHexDigitSet formUnionWithCharacterSet:DigitSet];
    [InvertedHexDigitSet addCharactersInString:@"abcdefABCDEF"];
    [InvertedHexDigitSet invert];

    LCLetterSet = [[NSCharacterSet lowercaseLetterCharacterSet] retain];
    UCLetterSet = [[NSCharacterSet uppercaseLetterCharacterSet] retain];
    SpecialSet = [[NSCharacterSet characterSetWithCharactersInString:@"'()+,-./:=?"] retain];

// concrete syntax

    LCNameCharSet = [[NSCharacterSet characterSetWithCharactersInString:@"-."] retain];
    RecordEndSet = [[NSCharacterSet characterSetWithCharactersInString:@"\n"] retain];
    RecordStartSet = [[NSCharacterSet characterSetWithCharactersInString:@"\r"] retain];
    SepCharSet = [[NSCharacterSet characterSetWithCharactersInString:@"\t"] retain];
    SpaceSet = [[NSCharacterSet characterSetWithCharactersInString:@" "] retain];
    UCNameCharSet = [[NSCharacterSet characterSetWithCharactersInString:@"-."] retain];

// categories

    NameStartCharacterSet = [[NSMutableCharacterSet alloc] init];
    [NameStartCharacterSet formUnionWithCharacterSet:LCLetterSet];
    [NameStartCharacterSet formUnionWithCharacterSet:UCLetterSet];

    // Everything that is NOT a name character: letters, digits, '-' and '.' are collected and then inverted.
    InvertedNameCharacterSet = [[NSMutableCharacterSet alloc] init];
    [InvertedNameCharacterSet formUnionWithCharacterSet:NameStartCharacterSet];
    [InvertedNameCharacterSet formUnionWithCharacterSet:DigitSet];
    [InvertedNameCharacterSet formUnionWithCharacterSet:LCNameCharSet];
    [InvertedNameCharacterSet formUnionWithCharacterSet:UCNameCharSet];
    [InvertedNameCharacterSet invert];

    BlankSpaceSet = [[NSMutableCharacterSet alloc] init];
    [BlankSpaceSet formUnionWithCharacterSet:SpaceSet];
    [BlankSpaceSet formUnionWithCharacterSet:RecordEndSet];
    [BlankSpaceSet formUnionWithCharacterSet:RecordStartSet];
    [BlankSpaceSet formUnionWithCharacterSet:SepCharSet];
    InvertedBlankSpaceSet = [[BlankSpaceSet invertedSet] retain];

    CREFSet = [[NSMutableCharacterSet alloc] init];
    [CREFSet formUnionWithCharacterSet:DigitSet];
    [CREFSet addCharactersInString:@"xX"]; // SGML allows others, HTML does not

// made up

    CommentEndSet = [[NSCharacterSet characterSetWithCharactersInString:@"-"] retain];
    ContentEndSet = [[NSCharacterSet characterSetWithCharactersInString:@"<&"] retain];
    EndQuotedValueSet = [[NSCharacterSet characterSetWithCharactersInString:@"&\"\r\n"] retain];
    EndSingleQuotedValueSet = [[NSCharacterSet characterSetWithCharactersInString:@"&'\r\n"] retain];

    EndValueSet = [BlankSpaceSet mutableCopy];
    [EndValueSet addCharactersInString:@"&>"];

    EndTagSet = [[NSCharacterSet characterSetWithCharactersInString:@">'\""] retain];

    TagEndOrNameStartCharacterSet = [NameStartCharacterSet mutableCopy];
    [TagEndOrNameStartCharacterSet addCharactersInString:@">"];

    // Setup bitmaps
    // Each bitmap is requested exactly once: the call asks for a retained representation, so building the same bitmap a second time would strand the first one.

#define bitmapForSet(aSet) bitmapForCharacterSetDoRetain(aSet, YES);

    CommentEndCSBitmap = bitmapForSet(CommentEndSet);
    CREFCSBitmap = bitmapForSet(CREFSet);
    DigitCSBitmap = bitmapForSet(DigitSet);
    EndQuotedValueCSBitmap = bitmapForSet(EndQuotedValueSet);
    EndSingleQuotedValueCSBitmap = bitmapForSet(EndSingleQuotedValueSet);
    EndTagCSBitmap = bitmapForSet(EndTagSet);
    EndValueCSBitmap = bitmapForSet(EndValueSet);
    InvertedBlankSpaceCSBitmap = bitmapForSet(InvertedBlankSpaceSet);
    InvertedDigitCSBitmap = bitmapForSet(InvertedDigitSet);
    InvertedHexDigitCSBitmap = bitmapForSet(InvertedHexDigitSet);
    InvertedNameCSBitmap = bitmapForSet(InvertedNameCharacterSet);
    NameStartCSBitmap = bitmapForSet(NameStartCharacterSet);
    TagEndOrNameStartCSBitmap = bitmapForSet(TagEndOrNameStartCharacterSet);

    [pool release];
}

// Registration hook that currently does nothing: all three arguments are ignored. The entity tables are loaded from entities.plist in +initialize instead.
+ (void)registerItemName:(NSString *)itemName bundle:(NSBundle *)bundle description:(NSDictionary *)description;
{
    // Register entity lists here
}

// Disabled code: pluggable string-decoder support. It is compiled out, and the stringDecoderClass static it relies on is itself commented out near the top of this implementation.
#if 0
+ (id <OWStringDecoder>)stringDecoder;
{
    NSString *incomingStringEncodingDefault;
    NSString *decoderMode = nil;

    if (!stringDecoderClass)
        return nil;
    
    // Note that this default can be either a string or an integer, depending on whether we have a class or just an encoding
    incomingStringEncodingDefault = [[NSUserDefaults standardUserDefaults] stringForKey:@"OWIncomingStringEncoding"];
    if ([incomingStringEncodingDefault intValue] != 0)
        return nil; // We have a valid +stringEncoding, as processed below

    if (![incomingStringEncodingDefault hasPrefix:[[stringDecoderClass description] stringByAppendingString:@" "]])
        return nil;

    decoderMode = [incomingStringEncodingDefault substringFromIndex:[[stringDecoderClass description] length] + 1];
    if (!decoderMode || ![[stringDecoderClass filterModes] objectForKey:decoderMode])
        return nil;

    return [[[stringDecoderClass alloc] initWithMode:decoderMode] autorelease];
}

+ (void)setStringDecoderClass:(Class)aClass;
{
    stringDecoderClass = aClass;
}

+ (Class)stringDecoderClass;
{
    return stringDecoderClass;
}
#endif

// Returns YES when a replacement string is registered for entityName in the table built by +initialize.
+ (BOOL)recognizesEntityNamed:(NSString *)entityName;
{
    NSString *replacement = [stringEntityDictionary objectForKey:entityName];

    if (replacement == nil)
        return NO;
    return YES;
}

// Reverse lookup: returns the entity name whose replacement is exactly this one character, or nil when there is none.
+ (NSString *)entityNameForCharacter:(unichar)character;
{
    // TODO someday: use a map table here instead of requiring us to create these temporary 1-character strings?
    NSString *singleCharacterKey;
    NSString *entityName;

    // The reverse table is keyed by one-character strings, so wrap the character in a temporary string for the lookup.
    singleCharacterKey = [[NSString alloc] initWithCharacters:&character length:1];
    entityName = [entityNameDictionary objectForKey:singleCharacterKey];
    [singleCharacterKey release];

    return entityName;
}

// Init and dealloc

#warning Hey, guys, this isn't actually an init method!
// Second-stage setup, run after the superclass initializer (see -initWithPipeline:). Reads the Netscape-compatibility defaults, adopts the cursor, creates the output object stream and the scanner, registers the stream with the pipeline and starts content processing. Returns self.
- initWithDataStreamCursor:(OWDataStreamCharacterCursor *)aCharacterCursor forDTD:(OWSGMLDTD *)aDTD;
{
    NSUserDefaults *defaults = [NSUserDefaults standardUserDefaults];
    BOOL honorMetaCharset;

    flags.netscapeCompatibleComments = [defaults boolForKey:@"OWHTMLNetscapeCompatibleComments"];
    flags.netscapeCompatibleNewlineAfterEntity = [defaults boolForKey:@"OWHTMLNetscapeCompatibleNewlineAfterEntity"];
    flags.netscapeCompatibleNonterminatedEntities = [defaults boolForKey:@"OWHTMLNetscapeCompatibleNonterminatedEntities"];

    // Swap in the new cursor unless it is the one we already hold (releasing nil is a no-op).
    if (aCharacterCursor != characterCursor) {
        [characterCursor release];
        characterCursor = [aCharacterCursor retain];
    }

    objectStream = [[OWObjectStream alloc] init];
    tagTrie = [aDTD tagTrie];
    [objectStream setContentType:[aDTD destinationType]];
    scanner = [[OWDataStreamScanner alloc] initWithCursor:characterCursor];

    // Publish the output stream before any tokens are written to it.
    [pipeline addContent:objectStream];
    [pipeline cacheContent];

    [pipeline startProcessingContent];

    // The <meta> charset hack is opt-in. The tag types are not retained because they are never deallocated.
    honorMetaCharset = [defaults boolForKey:@"OWHTMLCharsetInMetaTag"];
    if (!honorMetaCharset) {
        metaCharsetHackTagType = nil;
        endMetaCharsetHackTagType = nil;
    } else {
        metaCharsetHackTagType = [aDTD tagTypeNamed:@"meta"];
        endMetaCharsetHackTagType = [aDTD tagTypeNamed:@"body"];
    }

    return self;
}

// Convenience variant: looks up the DTD for the encoded content type of the cursor's data stream, then defers to -initWithDataStreamCursor:forDTD:.
- initWithDataStreamCursor:(OWDataStreamCharacterCursor *)aCharacterCursor;
{
    id sourceContentType = [[aCharacterCursor dataStream] encodedContentType];

    return [self initWithDataStreamCursor:aCharacterCursor forDTD:[OWSGMLDTD dtdForSourceContentType:sourceContentType]];
}

// Runs the superclass initializer, then finishes setup using the characterCursor instance variable as it stands afterwards. Returns nil if the superclass initializer does.
- initWithPipeline:(OWPipeline *)aPipeline;
{
    if ([super initWithPipeline:aPipeline] == nil)
        return nil;

    return [self initWithDataStreamCursor:characterCursor];
}

// Releases the two objects allocated in -initWithDataStreamCursor:forDTD:.
// NOTE(review): characterCursor is also retained there but is not released here -- presumably the superclass owns that instance variable; confirm.
- (void)dealloc;
{
    [objectStream release];
    [scanner release];
    [super dealloc];
}

//

// Returns the object stream that the scanned tokens are written to (created in -initWithDataStreamCursor:forDTD:).
- (OWObjectStream *)outputStream;
{
    return objectStream;
}

// OWProcessor subclass

// Publishes the source string encoding to the pipeline context, then tokenizes the whole input. -scanContent publishes the encoding again when it finishes.
- (void)process;
{
    [self propagateStringEncoding];
    [self scanContent];
}

// Tells the output stream that the data was aborted before passing the abort on to the superclass.
- (void)processAbort;
{
    [objectStream dataAbort];
    [super processAbort];
}

// Debugging

// Extends the superclass's debug dictionary with the output stream and the scanner, when they exist.
- (NSMutableDictionary *)debugDictionary;
{
    NSMutableDictionary *info = [super debugDictionary];

    if (objectStream != nil)
        [info setObject:objectStream forKey:@"objectStream"];
    if (scanner != nil)
        [info setObject:scanner forKey:@"scanner"];

    return info;
}

@end

@implementation OWHTMLToSGMLObjects (Private)

#ifdef DEBUG
// When YES, -scanBeginTag and -scanEndTag log each tag they emit. DEBUG is defined unconditionally at the top of this file, so this section is always compiled.
static BOOL OWHTMLToSGMLObjectsDebug = NO;

// Turns the per-tag logging on or off for all instances.
+ (void)setDebug:(BOOL)newDebug;
{
    OWHTMLToSGMLObjectsDebug = newDebug;
}
#endif

// Main tokenizer loop. Consumes the scanner's input until it runs out: '<' starts a tag, '&' starts an entity, anything else is passed through as a text fragment. Afterwards the string encoding is published once more and the output stream is ended.
- (void)scanContent;
{
    if (!scanner)
        return;

    for (;;) {
        unichar nextCharacter = scannerPeekCharacter(scanner);

        // The end-of-data sentinel may also occur as a real character in the input, so only stop when the scanner itself reports that no data is left.
        if (nextCharacter == OFCharacterScannerEndOfDataCharacter && !scannerHasData(scanner))
            break;

        if (nextCharacter == '<') {
            scannerSkipPeekedCharacter(scanner);
            [self scanTag];
        } else if (nextCharacter == '&') {
            scannerSkipPeekedCharacter(scanner);
            [objectStream writeObject:[self readEntity]];
        } else {
            NSString *textFragment = [scanner readFragmentUpToLeftAngleBracketOrAmpersand];

            [objectStream writeObject:textFragment];
        }
    }

// #warning TODO: reimplement dominant-encoding algorithm
#if 0
    if ([dataCursor stringDecoder]) {
	NSStringEncoding dominantEncoding;

	dominantEncoding = [[dataCursor stringDecoder] dominantEncoding];
	if (dominantEncoding != 0 && dominantEncoding != ~(unsigned)0) {
	    [pipeline setContextObject:[NSNumber numberWithInt:dominantEncoding] forKey:@"dominantCharacterEncoding"];
	}
    }
#else
    [self propagateStringEncoding];
#endif

    [objectStream dataEnd];
}

// We hand NSStrings downstream, but later processors may still need the original document's string encoding (forms, for example, want to encode their responses in the character set of the page they came from). The data stream cursor is asked so that autodetected encodings are picked up as well.
// The encoding can change midstream; we assume that only happens near the start, before anything interesting has been parsed, so later processors never need to know which of several encodings a particular object arrived in.
- (void)propagateStringEncoding
{
    CFStringEncoding currentEncoding;
    NSNumber *recordedEncoding;

    currentEncoding = [[scanner dataStreamCursor] stringEncoding];
    if (currentEncoding == kCFStringEncodingInvalidId)
        return;

    // Leave the context alone when it already records this encoding.
    recordedEncoding = [pipeline contextObjectForKey:@"dominantCharacterEncoding"];
    if (recordedEncoding != nil && [recordedEncoding unsignedIntValue] == currentEncoding)
        return;

    [pipeline setContextObject:[NSNumber numberWithUnsignedInt:currentEncoding] forKey:@"dominantCharacterEncoding"];
}

// Called right after a '<' has been consumed. Dispatches on the following character: '/' is an end tag, '!' a markup declaration, '?' a processing instruction, a name-start character a begin tag. Anything else means the '<' was ordinary text and it is written out as such.
- (void)scanTag;
{
    unichar introducer = scannerPeekCharacter(scanner);

    if (introducer == '/') {
        scannerSkipPeekedCharacter(scanner);
        [self scanEndTag];
        return;
    }
    if (introducer == '!') {
        scannerSkipPeekedCharacter(scanner);
        [self scanMarkupDeclaration];
        return;
    }
    if (introducer == '?') {
        scannerSkipPeekedCharacter(scanner);
        [self scanProcessingInstruction];
        return;
    }

    // The name-start character is left in place for -scanBeginTag to read.
    if (characterIsMemberOfCSBitmap(NameStartCSBitmap, introducer))
        [self scanBeginTag];
    else
        [objectStream writeObject:@"<"];
}

// Parses a begin tag whose name starts at the scanner's current position and writes the resulting tag token to objectStream.
// Tags not found in the DTD's trie are skipped entirely. Attributes the tag type knows are stored on the tag; unknown attributes (and their values) are consumed and dropped.
- (void)scanBeginTag;
{
    OWSGMLTagType *tagType;
    OWSGMLTag *tag = nil;
    OFTrie *attributeTrie;
    OWSGMLAttribute *attribute;

    // The trie match must end on a non-name character, otherwise we only matched a prefix of a longer, unknown tag name.
    tagType = (OWSGMLTagType *)[scanner readLongestTrieElement:tagTrie];
    if (!tagType || !characterIsMemberOfCSBitmap(InvertedNameCSBitmap, scannerPeekCharacter(scanner))) {
        [self skipToEndOfTag];
        return;
    }

    attributeTrie = [tagType attributeTrie];

    while (scannerHasData(scanner)) {
        NSString *value;

        // Move to the next attribute name, or to the '>' that closes the tag.
        scannerScanUpToCharacterInCSBitmap(scanner, TagEndOrNameStartCSBitmap);
        if (scannerPeekCharacter(scanner) == '>') {
            scannerSkipPeekedCharacter(scanner);
            break;
        }

        attribute = (OWSGMLAttribute *)[scanner readLongestTrieElement:attributeTrie];

        if (attribute && characterIsMemberOfCSBitmap(InvertedNameCSBitmap, scannerPeekCharacter(scanner))) {
            // Known attribute: read its value, if it has one.
            scannerScanUpToCharacterInCSBitmap(scanner, InvertedBlankSpaceCSBitmap);
            if (scannerPeekCharacter(scanner) == '=') {
                unichar character;

                scannerSkipPeekedCharacter(scanner);
                scannerScanUpToCharacterInCSBitmap(scanner, InvertedBlankSpaceCSBitmap);

                switch ((character = scannerPeekCharacter(scanner))) {
                    case '"':
                    case '\'':
                        scannerSkipPeekedCharacter(scanner);
                        value = [self readValueWithDelimiterCSBitmap:(character == '"' ? EndQuotedValueCSBitmap : EndSingleQuotedValueCSBitmap) newlinesAreDelimiters:NO];
                        // Step over the closing quote, but never over a '>' (the quote may be missing).
                        if (scannerPeekCharacter(scanner) != '>')
                            scannerSkipPeekedCharacter(scanner);
                        break;
                    default:
                        value = [self readValueWithDelimiterCSBitmap:EndValueCSBitmap newlinesAreDelimiters:YES];
                        break;
                }
            } else {
                value = [OFNull nullStringObject];
            }
            // The tag object is only created once the first known attribute shows up.
            if (!tag)
                tag = [[OWSGMLTag alloc] initWithTokenType:OWSGMLTokenTypeStartTag tagType:tagType];
            [tag setValue:value atIndex:[attribute offset]];
        } else {
            // Unknown attribute: consume the rest of its name and any value without keeping them.
            scannerScanUpToCharacterInCSBitmap(scanner, InvertedNameCSBitmap);
            scannerScanUpToCharacterInCSBitmap(scanner, InvertedBlankSpaceCSBitmap);
            if (scannerPeekCharacter(scanner) == '=') {
                unichar character;

                scannerSkipPeekedCharacter(scanner);
                scannerScanUpToCharacterInCSBitmap(scanner, InvertedBlankSpaceCSBitmap);

                switch ((character = scannerPeekCharacter(scanner))) {
                    case '"':
                    case '\'':
                        scannerSkipPeekedCharacter(scanner);
                        [self skipValueWithDelimiterCSBitmap:(character == '"' ? EndQuotedValueCSBitmap : EndSingleQuotedValueCSBitmap) newlinesAreDelimiters:NO];
                        if (scannerPeekCharacter(scanner) != '>')
                            scannerSkipPeekedCharacter(scanner);
                        break;
                    default:
                        // An unquoted value ends at a newline, exactly as in the known-attribute case above. Treating the newline as part of the value would swallow whatever attribute follows on the next line.
                        [self skipValueWithDelimiterCSBitmap:EndValueCSBitmap newlinesAreDelimiters:YES];
                        break;
                }
            }
        }
    }

    // No known attributes were seen: use the tag type's shared attributeless tag instead of allocating one.
    if (!tag)
        tag = [tagType attributelessStartTag];
    [objectStream writeObject:tag];
#ifdef DEBUG
    if (OWHTMLToSGMLObjectsDebug)
        NSLog(@"Tag: %@", tag);
#endif

    // Ugly hack to support non-SGML tags such as <SCRIPT> and stylesheets
    if ([tagType contentIsNotValidSGML])
        [self scanNonSGMLContent:tag interpretEntities:NO];

    // Ugly hack to support changing charsets in mid-stream
    if (tagType == metaCharsetHackTagType) {
        [self metaCharsetTagHack:tag];
    } else if (tagType == endMetaCharsetHackTagType) {
        // Once the tag that ends the hack's scope is seen (registered as "body" during setup), stop looking for <meta> charset declarations.
        metaCharsetHackTagType = nil;
        endMetaCharsetHackTagType = nil;
    }

    // Only a tag allocated above belongs to us; the shared attributeless tag does not.
    if (tag != [tagType attributelessStartTag])
        [tag release];
}

// Handles <meta http-equiv="content-type" content="...">: when the content value names a recognizable string encoding, switches the data stream cursor to it mid-stream and republishes the encoding. Once an encoding has been accepted the hack disarms itself; tags that do not qualify are ignored.
- (void)metaCharsetTagHack:(OWSGMLTag *)tag
{
    NSString *equivalentHeader, *contentValue;
    OWParameterizedContentType *declaredContentType;
    CFStringEncoding declaredEncoding;

    equivalentHeader = [tag valueForAttribute:@"http-equiv"];
    if (equivalentHeader == nil)
        return;
    if ([equivalentHeader caseInsensitiveCompare:@"content-type"] != NSOrderedSame)
        return;

    contentValue = [tag valueForAttribute:@"content"];
    if (contentValue == nil)
        return; // Ignore tag: no content attribute value

    declaredContentType = [OWParameterizedContentType contentTypeForString:contentValue];
    if (declaredContentType == nil)
        return; // Ignore tag: content type failed to parse

    declaredEncoding = [OWDataStreamCharacterProcessor stringEncodingForContentType:declaredContentType];
    if (declaredEncoding == kCFStringEncodingInvalidId)
        return; // Ignore tag: unrecognized encoding or no encoding specified

    NS_DURING {
        // Neither call should raise in the normal case; if one does, the right thing is to ignore it and carry on as we were.
        [scanner discardReadahead];
        [[scanner dataStreamCursor] setCFStringEncoding:declaredEncoding];
    } NS_HANDLER {
        NSLog(@"Unable to change charset in midstream: %@", [localException description]);
    } NS_ENDHANDLER;

    [self propagateStringEncoding];

    // Disarm the hack.
    metaCharsetHackTagType = nil;
    endMetaCharsetHackTagType = nil;
}

// Reads an attribute value up to the first character in delimiterCSBitmap that really ends it. Entities inside the value are expanded. Newlines either end the value (newlinesAreDelimiters == YES) or are dropped and scanning continues.
// Returns the scanner's own token when the value was read in one piece, otherwise an autoreleased mutable string assembled from the pieces.
- (NSString *)readValueWithDelimiterCSBitmap:(CSBitmap)delimiterCSBitmap newlinesAreDelimiters:(BOOL)newlinesAreDelimiters;
{
    NSMutableString *accumulated = nil;
    NSString *piece = nil;
    BOOL finished = NO;

    do {
        unichar stopCharacter;

        piece = [scanner readFullTokenWithDelimiterCSBitmap:delimiterCSBitmap forceLowercase:NO];
        if (accumulated != nil)
            [accumulated appendString:piece];

        stopCharacter = scannerPeekCharacter(scanner);
        if (stopCharacter == '&') {
            id <OWSGMLToken> entityToken;

            scannerSkipPeekedCharacter(scanner);
            entityToken = [self readEntity];
            // Start accumulating from the piece read so far, then add the entity's text.
            if (accumulated == nil)
                accumulated = [[piece mutableCopy] autorelease];
            [accumulated appendString:[entityToken string]];
        } else if ((stopCharacter == '\r' || stopCharacter == '\n') && !newlinesAreDelimiters) {
            // True SGML would have us replace these with whitespace, but, for now, we just ignore them (ala Netscape 2.0).
            do {
                scannerSkipPeekedCharacter(scanner);
                stopCharacter = scannerPeekCharacter(scanner);
            } while (stopCharacter == '\r' || stopCharacter == '\n');
        } else {
            // A real delimiter, or a newline that counts as one.
            finished = YES;
        }

        // More pieces will follow, so make sure there is something to append them to.
        if (!finished && accumulated == nil)
            accumulated = [[piece mutableCopy] autorelease];
    } while (!finished);

    if (accumulated != nil)
        return accumulated;
    return piece;
}

// Consumes an attribute value without keeping it. Follows the same rules as -readValueWithDelimiterCSBitmap:newlinesAreDelimiters: -- entities are parsed (and discarded) so the scanner ends up in the same place, and newlines either end the value or are stepped over.
- (void)skipValueWithDelimiterCSBitmap:(CSBitmap)delimiterCSBitmap newlinesAreDelimiters:(BOOL)newlinesAreDelimiters;
{
    BOOL finished = NO;

    do {
        unichar stopCharacter;

        scannerScanUpToCharacterInCSBitmap(scanner, delimiterCSBitmap);
        stopCharacter = scannerPeekCharacter(scanner);

        if (stopCharacter == '&') {
            scannerSkipPeekedCharacter(scanner);
            // Only the side effect on the scanner matters here.
            (void)[self readEntity];
        } else if ((stopCharacter == '\r' || stopCharacter == '\n') && !newlinesAreDelimiters) {
            do {
                scannerSkipPeekedCharacter(scanner);
                stopCharacter = scannerPeekCharacter(scanner);
            } while (stopCharacter == '\r' || stopCharacter == '\n');
        } else {
            finished = YES;
        }
    } while (!finished);
}

// Called after "</" has been consumed. If a name does not follow, the two characters were plain text and are written out as such. Otherwise the end tag is emitted when its name is a known tag type, and the rest of the tag is skipped either way.
- (void)scanEndTag;
{
    OWSGMLTagType *tagType;

    if (characterIsMemberOfCSBitmap(NameStartCSBitmap, scannerPeekCharacter(scanner))) {
        tagType = (OWSGMLTagType *)[scanner readLongestTrieElement:tagTrie];
        // As for begin tags, the match only counts when the name really ends here.
        if (tagType != nil && characterIsMemberOfCSBitmap(InvertedNameCSBitmap, scannerPeekCharacter(scanner))) {
            [objectStream writeObject:[tagType attributelessEndTag]];
#ifdef DEBUG
            if (OWHTMLToSGMLObjectsDebug)
                NSLog(@"Tag: %@", [tagType attributelessEndTag]);
#endif
        }
        [self skipToEndOfTag];
    } else {
        [objectStream writeObject:@"</"];
    }
}

// Called after "<!" has been consumed. Handles the empty declaration "<!>", comments ("<!--"), and other declarations (skipped up to their closing '>'). When none of those apply, the text was not markup: the character just read is pushed back and "<!" is written out as text.
- (void)scanMarkupDeclaration;
{
    unichar firstCharacter = scannerReadCharacter(scanner);

    if (firstCharacter == '>')
        return; // Empty declaration! We're done.

    if (firstCharacter == '-' && scannerPeekCharacter(scanner) == '-') {
        scannerSkipPeekedCharacter(scanner);
        [self scanComment];
        return;
    }

    // In Netscape-compatible mode anything after "<!" is treated as a declaration; otherwise it has to start with a name-start character.
    if (flags.netscapeCompatibleComments || characterIsMemberOfCSBitmap(NameStartCSBitmap, firstCharacter)) {
        [self skipToEndOfTag];
        return;
    }

    // Not markup after all!
    [scanner skipCharacters:-1];
    [objectStream writeObject:@"<!"];
}

// Called after "<!--" has been consumed. Looks for an SGML-style comment close: "--" (extra dashes allowed), optional blank space, then '>'. If the input runs out before one is found, rewinds to where the comment began and falls back to skipping up to the first '>' instead.
- (void)scanComment;
{
    [scanner setRewindMark];
    do {
        scannerScanUpToCharacterInCSBitmap(scanner, CommentEndCSBitmap);

        // Two consecutive dashes are required; anything else sends us looking for the next dash.
        if (scannerReadCharacter(scanner) != '-')
            continue;
        if (scannerReadCharacter(scanner) != '-')
            continue;

        // Swallow any further dashes, then blank space before the expected '>'.
        while (scannerPeekCharacter(scanner) == '-')
            scannerSkipPeekedCharacter(scanner);
        scannerScanUpToCharacterInCSBitmap(scanner, InvertedBlankSpaceCSBitmap);

        if (scannerPeekCharacter(scanner) != '>')
            continue;

        [scanner discardRewindMark];
        scannerSkipPeekedCharacter(scanner);
        return;
    } while (scannerHasData(scanner));

    // Woops, not a proper SGML comment!  Let's try old-style HTML.
    [scanner rewindToMark];
    [self skipToEndOfTag];
}

// Called after "<?" has been consumed. A processing instruction runs up to the next '>' and is discarded. If there is no '>' at all, the "<?" was ordinary text: it is written out and the scanner is put back where it started so the text that followed is still scanned as content.
- (void)scanProcessingInstruction; // ISO 8879 8
{
    // Remember the starting point: a failed search leaves the scanner wherever it gave up, and we do not want to lose what it passed over. Same pattern as -skipToEndOfTag.
    [scanner setRewindMark];
    if (scannerScanUpToCharacter(scanner, '>')) {
        [scanner discardRewindMark];
        scannerSkipPeekedCharacter(scanner);
    } else {
        // Not markup after all!
        [scanner rewindToMark];
        [objectStream writeObject:@"<?"];
    }
}

// Called after a '&' has been consumed. '#' introduces a numeric character reference and a name-start character introduces a named entity; anything else means the '&' was literal and it is returned as text.
- (id <OWSGMLToken>)readEntity;
{
    unichar introducer = scannerPeekCharacter(scanner);

    if (introducer == '#') {
        scannerSkipPeekedCharacter(scanner);
        return [self readCharacterReference];
    }

    if (!characterIsMemberOfCSBitmap(NameStartCSBitmap, introducer))
        return @"&";

    return [self readEntityReference];
}

// Called after "&#" has been consumed. Parses a decimal ("&#233;") or hexadecimal ("&#xE9;") character reference and returns the character it names as a string. If neither a digit nor 'x'/'X' follows, the "&#" was literal text and is returned as such.
// A terminating ';' is consumed, as is a terminating newline unless the Netscape-compatible newline flag is set.
- (id <OWSGMLToken>)readCharacterReference;
{
    NSString *value;
    unichar character;
    unsigned int codePoint;

    character = scannerPeekCharacter(scanner);
    if (!characterIsMemberOfCSBitmap(CREFCSBitmap, character))
        return @"&#";

    // Keep the parsed number at full width: a unichar would silently wrap anything above 0xFFFF onto an unrelated character.
    if (characterIsMemberOfCSBitmap(DigitCSBitmap, character)) {
        codePoint = [self readNumber];
    } else { // character is 'x'
        scannerSkipPeekedCharacter(scanner);
        codePoint = [self readHexNumber];
    }

    // WJS: 5/19/98 Even though the upper control characters aren't mapped in ISO Latin-1, they work in Netscape and Windows, so we check for that range explicitly and interpret them as WindowsCP1252 characters.  Note that right now the NSString machinery turns 145 and 146 into single-quotes that don't draw very well in NSText (they aren't mapped to the right glyphs in their font), so we special-case those two and turn them into quotes that draw nicely.
    // WIML July2000: Change this to use the new functions in OmniFoundation
    if (codePoint == 145)
        codePoint = 96; // unicode GRAVE ACCENT
    else if (codePoint == 146)
        codePoint = 39; // unicode APOSTROPHE

    if (codePoint > 0x7e && codePoint < 0xa0) {
        unsigned char byte;
        NSData *data;

        byte = codePoint & 0xff;
        data = [[NSData alloc] initWithBytes:&byte length:1];
        value = [[[NSString alloc] initWithData:data encoding:NSWindowsCP1252StringEncoding] autorelease];
        [data release];
        if (value == nil) {
            // The decode can fail for a byte the encoding does not define; never hand nil downstream, use the raw character instead.
            character = (unichar)codePoint;
            value = [NSString stringWithCharacters:&character length:1];
        }
    } else if (codePoint > 0x10FFFF) {
        // Beyond the Unicode range (this also catches overflowed numbers): substitute U+FFFD REPLACEMENT CHARACTER.
        character = 0xFFFD;
        value = [NSString stringWithCharacters:&character length:1];
    } else if (codePoint > 0xFFFF) {
        // Supplementary-plane character: NSString stores UTF-16, so encode it as a surrogate pair.
        unichar surrogates[2];
        unsigned int offset = codePoint - 0x10000;

        surrogates[0] = (unichar)(0xD800 + (offset >> 10));
        surrogates[1] = (unichar)(0xDC00 + (offset & 0x3FF));
        value = [NSString stringWithCharacters:surrogates length:2];
    } else {
        character = (unichar)codePoint;
        value = [NSString stringWithCharacters:&character length:1];
    }

    character = scannerPeekCharacter(scanner);
    if (character == ';' || (!flags.netscapeCompatibleNewlineAfterEntity && character == '\n'))
        scannerSkipPeekedCharacter(scanner);
    return value;
}

// Reads a named entity (the '&' has already been consumed) and returns its replacement string. A terminating ';' is consumed, as is a terminating newline unless the Netscape-compatible newline flag is set.
// Unknown names are returned as literal text ("&name"). With the Netscape-compatible nonterminated-entities flag set, the longest known prefix of the name is used instead and the unused characters are handed back to the scanner.
- (id <OWSGMLToken>)readEntityReference;
{
    NSString *entityName, *replacement;
    unsigned int entityNameLength, prefixLength;

    entityName = [scanner readFullTokenWithDelimiterCSBitmap:InvertedNameCSBitmap forceLowercase:NO];
    entityNameLength = [entityName length];
    if (entityNameLength == 0)
        return @"&";

    replacement = [stringEntityDictionary objectForKey:entityName];
    if (replacement != nil) {
        unichar terminator = scannerPeekCharacter(scanner);
        BOOL consumeTerminator = (terminator == ';') || (terminator == '\n' && !flags.netscapeCompatibleNewlineAfterEntity);

        if (consumeTerminator)
            scannerSkipPeekedCharacter(scanner);
        return replacement;
    }

    if (flags.netscapeCompatibleNonterminatedEntities) {
        // Try successively shorter prefixes, longest first.
        for (prefixLength = entityNameLength - 1; prefixLength > 0; prefixLength--) {
            replacement = [stringEntityDictionary objectForKey:[entityName substringToIndex:prefixLength]];
            if (replacement != nil) {
                // Give back the characters that were not part of the matched entity name.
                [scanner skipCharacters:-(int)(entityNameLength - prefixLength)];
                return replacement;
            }
        }
    }

    return [NSString stringWithFormat:@"&%@", entityName];
}

// Reads a token delimited by the first non-digit character and returns its -intValue.
- (unsigned int)readNumber;
{
    NSString *digits = [scanner readFullTokenWithDelimiterCSBitmap:InvertedDigitCSBitmap forceLowercase:NO];

    return [digits intValue];
}

// Reads a token delimited by the first non-hexadecimal-digit character and returns its -hexValue.
- (unsigned int)readHexNumber;
{
    NSString *hexDigits = [scanner readFullTokenWithDelimiterCSBitmap:InvertedHexDigitCSBitmap forceLowercase:NO];

    return [hexDigits hexValue];
}

// Skips forward past the '>' that closes the current tag, stepping over single- and double-quoted stretches so that a '>' inside quotes does not count. If no closing '>' can be found, the scanner is rewound to where it started.
- (void)skipToEndOfTag;
{
    [scanner setRewindMark];

    // Each pass stops at the next '>' or quote character; the loop ends when there are none left.
    while (scannerScanUpToCharacterInCSBitmap(scanner, EndTagCSBitmap)) {
        unichar delimiter = scannerReadCharacter(scanner);

        if (delimiter == '>') {
            [scanner discardRewindMark];
            return; // success
        }

        // An opening quote: find the matching quote and step over it.
        if (scannerScanUpToCharacter(scanner, delimiter))
            scannerSkipPeekedCharacter(scanner);
    }

    // Fine, I give up!
    [scanner rewindToMark];
}

// Copies the text that follows nonSGMLTag to objectStream as string fragments, stopping once the matching end tag has been consumed or the scanner has no more data, and then writes a synthesized end-tag token for nonSGMLTag's tag type.
//
// nonSGMLTag: the tag whose content is being scanned; its name is used to build the end-tag pattern to search for.
// shouldInterpretEntities: not referenced anywhere in this method body.
//
// Returns immediately, writing nothing, when there is no scanner.
- (void)scanNonSGMLContent:(OWSGMLTag *)nonSGMLTag interpretEntities:(BOOL)shouldInterpretEntities;
{
    unichar *uppercaseEndTag, *lowercaseEndTag;
    unsigned int endTagLength;
    unsigned int endTagConsumed;
    NSString *endTagString;
    OWSGMLTag *endTag;
    NSString *fragment, *deferredFragment;

// #warning -scanNonSGMLContent: should really scan NSData
    // Non-SGML content is probably best interpreted as raw bytes instead of whatever character encoding the document happens to be in.  However, AFAIK JavaScript doesn't handle non-ASCII characters so this isn't immediately urgent.
    // 16Oct2000 wiml: I'm no longer convinced this should scan bytes instead of characters...

    if (!scanner)
        return;
        
    // With TWEEDLEDEE defined, the end-tag pattern includes the closing '>' ("</name>"), so the whole end tag is consumed by the loop below. Without it, the pattern stops after the tag name ("</name") and the rest of the tag is skipped with -skipToEndOfTag after the loop. Note that this #define is not #undef'd in this method.
#define TWEEDLEDEE

#ifdef TWEEDLEDEE
    endTagString = [NSString stringWithFormat:@"</%@>", [[nonSGMLTag name] lowercaseString]];
#else
    endTagString = [NSString stringWithFormat:@"</%@", [[nonSGMLTag name] lowercaseString]];
#endif
    // Build two stack-allocated character buffers of the same length, one holding the lowercase pattern and one the uppercase pattern; both are handed to the scanner as the terminator and its alternative.
    endTagLength = [endTagString length];
    OBASSERT(endTagLength < 1024); // tag names longer than 1k?!?
    lowercaseEndTag = alloca(endTagLength * sizeof(unichar));
    [endTagString getCharacters:lowercaseEndTag];
    endTagString = [endTagString uppercaseString];
    // The uppercase buffer is sized and compared using the lowercase length, so uppercasing must not change the length.
    OBASSERT(endTagLength == [endTagString length]);
    uppercaseEndTag = alloca(endTagLength * sizeof(unichar));
    [endTagString getCharacters:uppercaseEndTag];

    // endTagConsumed counts how many characters of the end tag have been matched so far; it is passed by reference so a match can continue across successive reads.
    endTagConsumed = 0;
    deferredFragment = nil;
    while (endTagConsumed < endTagLength && scannerHasData(scanner)) {
        fragment = [scanner readFragmentUpThroughCharacters:lowercaseEndTag alternative:uppercaseEndTag length:endTagLength seen:&endTagConsumed];

        // If endTagConsumed is > 0, then the last few characters of "fragment" might be the beginning of the end tag, so we shouldn't pass them along to the output stream. But they might not be, so we have to keep them around in deferredFragment and pass them along if, on the next iteration, it turns out to have been a false alarm.
        if (deferredFragment) {
            if ([fragment length] > endTagConsumed) {
                // The current match (if any) lies entirely inside the new fragment, so the previously held-back text was a false alarm: emit it as-is.
                [objectStream writeObject:deferredFragment];
                [deferredFragment release];
                deferredFragment = nil;
            } else {
                // The match may reach back into the held-back text: rejoin the two pieces so the split below is computed over the combined string.
                fragment = [deferredFragment stringByAppendingString:fragment];
                [deferredFragment release];
                deferredFragment = nil;
            }
        }

        if (endTagConsumed > 0) {
            unsigned int possiblyTheBeginningOfTheEnd;

            // Hold back the trailing endTagConsumed characters (retained, since they must survive until the next iteration or the end of the loop) and emit only what precedes them.
            possiblyTheBeginningOfTheEnd = [fragment length] - endTagConsumed;
            deferredFragment = [fragment substringFromIndex:possiblyTheBeginningOfTheEnd];
            [deferredFragment retain];
            fragment = [fragment substringToIndex:possiblyTheBeginningOfTheEnd];
        }

        // fragment may be an empty string here (for example when everything read was held back).
        [objectStream writeObject:fragment];
#ifdef DEBUG
        if (OWHTMLToSGMLObjectsDebug) {
            NSLog(@"NonSGMLFragment: %@", fragment);
            NSLog(@"NonSGML: length=%d consumed=%d", endTagLength, endTagConsumed);
        }
#endif
    }

    // Whatever is still held back is dropped without being written: after a complete match it is the end tag text itself; if the scanner ran out of data mid-match it is the partial match.
    [deferredFragment release];

    
#ifndef TWEEDLEDEE
    [self skipToEndOfTag];
#endif

    // Create an end tag and write it to the stream (since we just skipped over the real one).
    // This happens unconditionally, including when the loop ended because the scanner had no more data.
    endTag = [[OWSGMLTag alloc] initWithTokenType:OWSGMLTokenTypeEndTag tagType:[nonSGMLTag tagType]];
    [objectStream writeObject:endTag];
#ifdef DEBUG
    if (OWHTMLToSGMLObjectsDebug)
        NSLog(@"Tag: %@", endTag);
#endif
    [endTag release];
}

@end

@implementation OWDataStreamScanner (SpecialScanning)

// Returns the characters from the current scan location up to (but not including) the next '<' or '&', or up to scanEnd if neither occurs first. The scan location is left on the stopping character.
// Returns nil when the scanner has no data; returns an empty string when the very first character is '<' or '&'.
- (NSString *)readFragmentUpToLeftAngleBracketOrAmpersand;
{
    unichar *fragmentStart;

    if (!scannerHasData(self))
        return nil;

    fragmentStart = scanLocation;
    for (; scanLocation < scanEnd; scanLocation++) {
        unichar current;

        current = *scanLocation;
        if (current == '<' || current == '&')
            break;
    }

    return [NSString stringWithCharacters:fragmentStart length:scanLocation - fragmentStart];
}

// TODO: Ideally, want a version of this that returns an NSData. Semantics problematic though.  Also, this method interface sucks.
// Consumes characters from the current scan location until either the whole terminator has been matched or scanEnd is reached, and returns everything consumed by this call (including any terminator characters matched at the end).
//
// terminator, alterminator: two character arrays of terminatorLength characters each. At every position a character matches if it equals the character at that position in either array (the caller uses this for lowercase/uppercase variants).
// terminatorSeen: in/out. On entry, the number of terminator characters already matched by earlier calls; on return, the number matched so far. The terminator is complete when this equals terminatorLength.
//
// Returns nil when the scanner has no data. Because scanning stops at scanEnd, a call can return with the terminator only partially matched; the caller continues by calling again with the same terminatorSeen.
- (NSString *)readFragmentUpThroughCharacters:(unichar *)terminator alternative:(unichar *)alterminator length:(int)terminatorLength seen:(int *)terminatorSeen
{
    unichar *startLocation;
    int terminatorProgress;

    if (!scannerHasData(self))
        return nil;

    startLocation = scanLocation;

    terminatorProgress = *terminatorSeen;
    while (scanLocation < scanEnd && terminatorProgress < terminatorLength) {
        unichar character;

        character = *scanLocation;
        if (character == terminator[terminatorProgress] ||
            character == alterminator[terminatorProgress]) {
            terminatorProgress++;
        } else if (character == terminator[0] || character == alterminator[0]) {
            // The partial match failed, but this character may itself begin the terminator (e.g. the second '<' in "<</script>"). Restart the match here instead of discarding it, otherwise the terminator that follows would be missed.
            // This restart is exact as long as the terminator's first character does not occur again later in the terminator, which holds for "</name>"-style end tags.
            terminatorProgress = 1;
        } else {
            terminatorProgress = 0;
        }
        scanLocation++;
    }
    *terminatorSeen = terminatorProgress;

    return [NSString stringWithCharacters:startLocation length:scanLocation - startLocation];
}

@end
