// Copyright 2001 Omni Development, Inc.  All rights reserved.
//
// This software may only be used and reproduced according to the
// terms in the file OmniSourceLicense.html, which should be
// distributed with this project and can also be found at
// http://www.omnigroup.com/DeveloperResources/OmniSourceLicense.html.

#import "OWCSSTokenizer.h"

#import <Foundation/Foundation.h>
#import <OmniBase/OmniBase.h>
#import <OmniFoundation/OmniFoundation.h>

#import "OWCSSIdentifier.h"
#import "OWCSSNumber.h"

RCS_ID("$Header: /NetworkDisk/Source/CVS/OmniGroup/Frameworks/OWF/CSS.subproj/OWCSSTokenizer.m,v 1.20 2001/08/24 06:17:20 wjs Exp $");

// Internal scanning helpers; implemented in the (Private) category at the bottom of this file.
@interface OWCSSTokenizer (Private)
// Does the actual scanning for -getNextToken:excludingIdentifiers:excludingNumbers: -- reads one token from the scanner and stores its value through nextTokenPointer.
- (OWCSSTokenType)_readRealNextToken:(id *)nextTokenPointer excludingIdentifiers:(BOOL)excludingIdentifiers excludingNumbers:(BOOL)excludingNumbers;
// Reads the parenthesized argument list that follows startToken; the returned array starts with startToken itself.
- (NSArray *)_readFunctionWithStartToken:(id)startToken;
// Interprets the backslash escape at the scanner's current position and returns the text it stands for.
- (NSString *)_readEscapedCharactersInsideOfQuotes:(BOOL)isInsideQuotes;
// Reads a single- or double-quoted string and returns its contents without the surrounding quotes.
- (NSString *)_readQuotedString;
// Accumulates text up to the next non-backslash character in specialCharactersSet, expanding backslash escapes along the way.
- (NSString *)_readEscapableStringWithSpecials:(OFCharacterSet *)specialCharactersSet inQuotes:(BOOL)readingInsideQuotes;
@end

@implementation OWCSSTokenizer

// HTML comment brackets: the tokenizer skips the brackets themselves and processes what is between them normally.
static NSString *cdoString = @"<!--";
static NSString *cdcString = @"-->";
// CSS comment delimiters: everything between them is skipped.
static NSString *commentStart = @"/*";
static NSString *commentEnd = @"*/";

// Shared character sets, created once in +initialize.
static OFCharacterSet *punctuationCharacterSet; // ; { } ( ) [ ]
static OFCharacterSet *whitespaceCharacterSet; // space, tab, CR, LF, form feed
static OFCharacterSet *digitCharacterSet; // decimal digits
static OFCharacterSet *hexadecimalCharacterSet; // 0-9, a-f, A-F

static OFCharacterSet *identifierStartCharacterSet; // letters plus backslash
static OFCharacterSet *invertedNameCharacterSet; // everything that is NOT alphanumeric, '-' or '_'; used as the delimiter set when reading names

static OFCharacterSet *backslashQuoteCharacterSet; // backslash and double quote: stop characters inside "..." strings
static OFCharacterSet *backslashSingleQuoteValueCharacterSet; // backslash and single quote: stop characters inside '...' strings
static OFCharacterSet *whitespaceBackslashClosingParenthesisCharacterSet; // whitespace, backslash and ')': stop characters for an unquoted url(...) argument
static OFCharacterSet *invertedCommaRightParenthesisCharacterSet; // everything except ',' and ')'; used to skip the remainder of a function parameter

+ (void)initialize;
{
    OBINITIALIZE;

    // Plain sets that can be spelled out character by character
    whitespaceCharacterSet = [[OFCharacterSet alloc] initWithString:@" \t\r\n\f"];
    punctuationCharacterSet = [[OFCharacterSet alloc] initWithString:@";{}()[]"];
    hexadecimalCharacterSet = [[OFCharacterSet alloc] initWithString:@"0123456789abcdefABCDEF"];
    digitCharacterSet = [[OFCharacterSet alloc] initWithCharacterSet:[NSCharacterSet decimalDigitCharacterSet]];

    // Stop characters for the two flavors of quoted string and for unquoted url() arguments
    backslashSingleQuoteValueCharacterSet = [[OFCharacterSet alloc] initWithString:@"\\'"];
    backslashQuoteCharacterSet = [[OFCharacterSet alloc] initWithString:@"\\\""];
    whitespaceBackslashClosingParenthesisCharacterSet = [[OFCharacterSet alloc] initWithString:@" \t\r\n\f\\)"];

    // Everything except ',' and ')': build the two-character set, then flip it
    invertedCommaRightParenthesisCharacterSet = [[OFCharacterSet alloc] initWithString:@",)"];
    [invertedCommaRightParenthesisCharacterSet invert];

    // An identifier may begin with a letter or with a backslash escape
    identifierStartCharacterSet = [[OFCharacterSet alloc] initWithCharacterSet:[NSCharacterSet letterCharacterSet]];
    [identifierStartCharacterSet addCharactersInString:@"\\"];

    // Name delimiters: start from "not alphanumeric", then take '-' and '_' back out so they count as name characters.
    // ('_' is not a legal nmchar under CSS2's syntax rules -- http://www.w3.org/TR/REC-CSS2/syndata.html -- but other browsers accept it, so we do too.)
    invertedNameCharacterSet = [[OFCharacterSet alloc] initWithCharacterSet:[[NSCharacterSet alphanumericCharacterSet] invertedSet]];
    [invertedNameCharacterSet removeCharactersInString:@"-_"];
}

// Init and dealloc

// Initializes a tokenizer that reads its input from aScanner.  The scanner is retained for the lifetime of the tokenizer.
// Returns nil if the superclass initializer fails.
- initWithScanner:(OFCharacterScanner *)aScanner;
{
    // Adopt whatever object super hands back; if it failed, report the failure rather than returning a stale self.
    self = [super init];
    if (self == nil)
        return nil;

    scanner = [aScanner retain];

    return self;
}

- (void)dealloc;
{
    // Pop the rewind mark we still have outstanding (if any) before giving up the scanner
    if (scannerHasRewindMark) {
        [scanner discardRewindMark];
    }

    [scanner release];

    [super dealloc];
}


// API

// Convenience form: reads the next token with neither identifiers nor numbers excluded.
- (OWCSSTokenType)getNextToken:(id *)nextTokenPointer;
{
    OWCSSTokenType resultType;

    resultType = [self getNextToken:nextTokenPointer excludingIdentifiers:NO excludingNumbers:NO];
    return resultType;
}

// Reads the next token; when skipWhitespace is YES and that token is whitespace, reads one more token and reports that one instead.
// The caller's pointer is only written once, with the value of the token whose type is returned.
- (OWCSSTokenType)getNextToken:(id *)nextTokenPointer skipWhitespace:(BOOL)skipWhitespace;
{
    id value;
    OWCSSTokenType type = [self getNextToken:&value];

    if (type == OWCSSTokenWhitespace && skipWhitespace) {
        type = [self getNextToken:&value];
    }

    *nextTokenPointer = value;
    return type;
}

// Reads the next token, first moving the scanner's rewind mark to the current position so that -ungetLastToken can back up over it.
- (OWCSSTokenType)getNextToken:(id *)nextTokenPointer excludingIdentifiers:(BOOL)excludingIdentifiers excludingNumbers:(BOOL)excludingNumbers;
{
    OWCSSTokenType resultType;

    // Trade the previous token's mark (if we still hold one) for a fresh one at the current position
    if (scannerHasRewindMark) {
        [scanner discardRewindMark];
    }
    [scanner setRewindMark];
    scannerHasRewindMark = YES;

    resultType = [self _readRealNextToken:nextTokenPointer excludingIdentifiers:excludingIdentifiers excludingNumbers:excludingNumbers];
    return resultType;
}

// Rewinds the scanner to the mark set by the most recent -getNextToken... call.
// Raises a "Tokenizer Error" exception when there is no mark to rewind to.
- (void)ungetLastToken;
{
    if (scannerHasRewindMark) {
        // Rewinding uses up the mark, so note that we no longer hold one
        scannerHasRewindMark = NO;
        [scanner rewindToMark];
        return;
    }

    [NSException raise:@"Tokenizer Error" format:@"CSS bug: can't unget twice in a row."];
}

// Consumes tokens until a punctuation token whose text occurs in punctuationString has been read.
// Returns YES once such a token is consumed, or NO if the end of the input is reached first.
- (BOOL)skipTokensUpToAndIncludingPunctuation:(NSString *)punctuationString;
{
    id value;
    OWCSSTokenType type;

    for (;;) {
        type = [self getNextToken:&value];

        if (type == OWCSSTokenEOF)
            return NO;
        if (type != OWCSSTokenPunctuation)
            continue;
        if ([punctuationString containsString:value])
            return YES;
    }
}


@end

@implementation OWCSSTokenizer (Private)

// Scans one token from the scanner's current position and returns its type.
//
// *nextTokenPointer is reset to nil up front and then set to the token's value:
//   OWCSSTokenEOF, OWCSSTokenWhitespace -- left nil
//   OWCSSTokenPunctuation               -- a one-character NSString
//   OWCSSTokenNumber                    -- an OWCSSNumber (value plus an optional units identifier)
//   OWCSSTokenString                    -- an NSString (name, #name, quoted string, or digit-led string)
//   OWCSSTokenIdentifier                -- the element found in the OWCSSIdentifier trie
//   OWCSSTokenFunction                  -- the array built by -_readFunctionWithStartToken:
//
// excludingIdentifiers: when YES, names are not looked up in the identifier trie and come back as plain strings.
// excludingNumbers: when YES, a leading digit, '.' or '-' is not parsed as a number; a leading digit starts a string instead.
//
// A leading "<!--", "-->" or complete CSS comment is consumed and the following token is returned in its place, read with the same exclusion flags the caller supplied.  An unterminated CSS comment yields OWCSSTokenEOF.
- (OWCSSTokenType)_readRealNextToken:(id *)nextTokenPointer excludingIdentifiers:(BOOL)excludingIdentifiers excludingNumbers:(BOOL)excludingNumbers;
{
    unichar character;

    *nextTokenPointer = nil;

    character = scannerPeekCharacter(scanner);
    
    if (character == OFCharacterScannerEndOfDataCharacter)
        return OWCSSTokenEOF;

    // HTML-style comments (brackets themselves are skipped, contents are processed normally)
    // Note: we go back through the public entry point so the rewind mark moves past the bracket, but we must hand along the caller's exclusion flags or the token after the bracket would be read with the defaults.
    if (character == [cdoString characterAtIndex:0]) {
        if ([scanner scanString:cdoString peek:NO])
            return [self getNextToken:nextTokenPointer excludingIdentifiers:excludingIdentifiers excludingNumbers:excludingNumbers];
    } else if (character == [cdcString characterAtIndex:0]) {
        if ([scanner scanString:cdcString peek:NO])
            return [self getNextToken:nextTokenPointer excludingIdentifiers:excludingIdentifiers excludingNumbers:excludingNumbers];
    }
    
    // CSS-style comments (contents are skipped)
    if (character == [commentStart characterAtIndex:0]) {
        if ([scanner scanString:commentStart peek:NO]) {
            if ([scanner scanUpToString:commentEnd] && [scanner scanString:commentEnd peek:NO])
                return [self getNextToken:nextTokenPointer excludingIdentifiers:excludingIdentifiers excludingNumbers:excludingNumbers];
            // Unterminated comment: treat the rest of the input as swallowed
            return OWCSSTokenEOF;
        }
    }

    // Whitespace (collapsed, skipping comments)
    if (OFCharacterSetHasMember(whitespaceCharacterSet, character)) {
        while (1) {
            character = scannerPeekCharacter(scanner);
            if (!OFCharacterSetHasMember(whitespaceCharacterSet, character))
                break;
    
            scannerScanUpToCharacterNotInOFCharacterSet(scanner, whitespaceCharacterSet);
    
            // HTML-style comments (brackets themselves are skipped, contents are processed normally)
            character = scannerPeekCharacter(scanner);
            if (character == [cdoString characterAtIndex:0]) {
                if ([scanner scanString:cdoString peek:NO])
                    continue;
            } else if (character == [cdcString characterAtIndex:0]) {
                if ([scanner scanString:cdcString peek:NO])
                    continue;
            }
            
            // CSS-style comments (contents are skipped)
            if (character == [commentStart characterAtIndex:0]) {
                if ([scanner scanString:commentStart peek:NO]) {
                    if ([scanner scanUpToString:commentEnd] && [scanner scanString:commentEnd peek:NO])
                        continue;
                    // TODO: How shall we handle unmatched comments?  Skip to the end of the input?
                }
            }
            break;
        }
        return OWCSSTokenWhitespace;
        
    // Number
    } else if (!excludingNumbers && (OFCharacterSetHasMember(digitCharacterSet, character) || character == '.' || character == '-')) {
        double decimalPlace = 0.0; // weight of the next fractional digit; 0.0 while still in the integer part
        double doubleValue = 0.0;
        OWCSSIdentifier *unitsIdentifier = nil;
        double signValue = 1.0;
        
        if (character == '.') {
            scannerSkipPeekedCharacter(scanner);
            character = scannerPeekCharacter(scanner);
            // If the next character is not a digit, then this period is just punctuation
            if (!OFCharacterSetHasMember(digitCharacterSet, character)) {
                *nextTokenPointer = @".";
                return OWCSSTokenPunctuation;
            }

            decimalPlace = 0.1;
        } else if (character == '-') {
            scannerSkipPeekedCharacter(scanner);
            character = scannerPeekCharacter(scanner);
            // If the next character is not a digit or period, then this minus is just punctuation
            if (!OFCharacterSetHasMember(digitCharacterSet, character) && character != '.' ) {
                *nextTokenPointer = @"-";
                return OWCSSTokenPunctuation;
            }

            signValue = -1.0;
        }
        
        // Accumulate digits, switching to fractional weights after the first period
        while (1) {
            if (OFCharacterSetHasMember(digitCharacterSet, character)) {
                if (decimalPlace > 0.0) {
                    doubleValue += (character - '0') * decimalPlace;
                    decimalPlace /= 10.0;
                } else {
                    doubleValue = (doubleValue * 10) + (character - '0');
                }
            } else if (character == '.') {
                if (decimalPlace > 0.0) // we already hit one period
                    break;
                decimalPlace = 0.1;
            } else
                break;
        
            scannerSkipPeekedCharacter(scanner);
            character = scannerPeekCharacter(scanner);
        }
        
        // Optional units: either a known identifier from the trie or a percent sign
        if (OFCharacterSetHasMember(identifierStartCharacterSet, character))
            unitsIdentifier = (OWCSSIdentifier *)[scanner readLongestTrieElement:[OWCSSIdentifier identifierTrie] delimiterOFCharacterSet:invertedNameCharacterSet];
        else if (character == '%') {
            scannerSkipPeekedCharacter(scanner);
            unitsIdentifier = OWCSSPercentIdentifier;
        }
        
        *nextTokenPointer = [OWCSSNumber numberWithFloatValue:(float)(doubleValue * signValue) unitsIdentifier:unitsIdentifier];
        return OWCSSTokenNumber;

    // Badly-formed microsoft-style hex number (only hit this case if excludingNumbers is YES)
    // ...we see this on msnbc.com: {color:00cc00}
    } else if (OFCharacterSetHasMember(digitCharacterSet, character)) {
        *nextTokenPointer = [self _readEscapableStringWithSpecials:invertedNameCharacterSet inQuotes:NO];
        return OWCSSTokenString;

    // Name
    } else if (character == '#') {
        scannerSkipPeekedCharacter(scanner);
        *nextTokenPointer = [@"#" stringByAppendingString:[self _readEscapableStringWithSpecials:invertedNameCharacterSet inQuotes:NO]];
        return OWCSSTokenString;
    
    // Identifier
    } else if (OFCharacterSetHasMember(identifierStartCharacterSet, character)) {
        NSString *name;
        
        if (!excludingIdentifiers) {
            OWCSSIdentifier *cssIdentifier;
            
            cssIdentifier = (OWCSSIdentifier *)[scanner readLongestTrieElement:[OWCSSIdentifier identifierTrie] delimiterOFCharacterSet:invertedNameCharacterSet];
            if (cssIdentifier != nil) {
                if (scannerPeekCharacter(scanner) == '(') {
                    *nextTokenPointer = [self _readFunctionWithStartToken:cssIdentifier];
                    return OWCSSTokenFunction;
                } else {
                    *nextTokenPointer = cssIdentifier;
                    return OWCSSTokenIdentifier;
                }
            }
        }
        
        // Not a known identifier (or identifiers are excluded): read it as a plain name
        name = [self _readEscapableStringWithSpecials:invertedNameCharacterSet inQuotes:NO];
        character = scannerPeekCharacter(scanner);
        if (character == '(') {
            *nextTokenPointer = [self _readFunctionWithStartToken:[name lowercaseString]];
            return OWCSSTokenFunction;
        } else {
            *nextTokenPointer = name;
            return OWCSSTokenString;
        }
        
    // Quoted string
    } else if (character == '"' || character == '\'') {
        *nextTokenPointer = [self _readQuotedString];
        return OWCSSTokenString;
    }

    // Otherwise, assume it's some kind of punctuation
    scannerSkipPeekedCharacter(scanner);
    *nextTokenPointer = [NSString stringWithCharacter:character];
    return OWCSSTokenPunctuation;
}

// Reads a function's parenthesized argument list.  On entry the scanner must be positioned on the opening '('.
//
// Returns an array whose first element is startToken, followed by the parameters:
//   - when startToken is OWCSSURLIdentifier: a single string, read either as a quoted string or as unquoted text up to whitespace or ')'; anything left before the closing ')' is discarded.
//   - otherwise: one entry per comma-separated parameter, holding the value of the first token in that parameter (@"" when there is none); the remainder of each parameter is skipped.
// If the input ends before the closing ')', the parameter in progress is not added.
- (NSArray *)_readFunctionWithStartToken:(id)startToken;
{
    NSMutableArray *parameterArray;

    // It's a function
    parameterArray = [NSMutableArray array];
    [parameterArray addObject:startToken];
    
    scannerSkipPeekedCharacter(scanner); // (
    
    // Push an extra rewind mark onto the scanner stack so that new rewind marks replace this one rather than replacing the existing rewind mark which points at the beginning of the function.  This ensures that -ungetLastToken will unget the entire function, rather than just the last token in the function.
    [scanner setRewindMark];

    if (startToken == OWCSSURLIdentifier) {
        unichar character;
        NSString *scannedString;
    
        scannerScanUpToCharacterNotInOFCharacterSet(scanner, whitespaceCharacterSet);

        character = scannerPeekCharacter(scanner);
        if (character == '"' || character == '\'')
            scannedString = [self _readQuotedString];
        else
            scannedString = [self _readEscapableStringWithSpecials:whitespaceBackslashClosingParenthesisCharacterSet inQuotes:NO];

        scannerScanUpToCharacter(scanner, ')'); // Warning: if we get bad input in a URL, we won't necessarily recover perfectly because we would hit a quoted parenthesis.  On the other hand, what do you want from bad input, huh?
        scannerSkipPeekedCharacter(scanner);

        [parameterArray addObject:scannedString];
    } else {
        // Token values are not always strings (numbers, identifiers and nested functions show up here too), so this is typed to match -getNextToken:'s (id *) out-parameter.
        id tokenValue = @"";
        
        while (1) {
            OWCSSTokenType tokenType;
            unichar character;
            
            scannerScanUpToCharacterNotInOFCharacterSet(scanner, whitespaceCharacterSet);

            character = scannerPeekCharacter(scanner);
            if (character == ',') {
                // End of this parameter; start collecting the next one
                scannerSkipPeekedCharacter(scanner);
                [parameterArray addObject:tokenValue];
                tokenValue = @"";
                continue;
            } else if (character == ')') {
                // End of the last parameter
                scannerSkipPeekedCharacter(scanner);
                [parameterArray addObject:tokenValue];
                break;
            } else if (character == OFCharacterScannerEndOfDataCharacter)
                break;
                
            tokenType = [self getNextToken:&tokenValue];
            if (tokenType == OWCSSTokenEOF)
                break;
            if (tokenValue == nil)
                tokenValue = @"";
            
            // Only the first token of a parameter is kept; skip everything else up to the next ',' or ')'
            scannerScanUpToCharacterNotInOFCharacterSet(scanner, invertedCommaRightParenthesisCharacterSet);
        }
    }

    [scanner discardRewindMark]; // Restores our rewind mark to the beginning of the function (see -setRewindMark call above)

    return parameterArray;
}

// Interprets one backslash escape.  On entry the scanner must be positioned on the backslash.
//
// Returns the text the escape stands for:
//   - backslash at end of data: @""
//   - backslash + newline (CR, LF, or CR LF): @"" when isInsideQuotes is YES, otherwise @"\n"
//   - backslash + hex digits: the character with that code; up to six digits are read
//   - backslash + anything else: that character itself
- (NSString *)_readEscapedCharactersInsideOfQuotes:(BOOL)isInsideQuotes;
{
    unichar quotedCharacter;

    // See section 4.1.3 "Characters and case" in http://www.w3.org/TR/CSS2/syndata.html 
    scannerSkipPeekedCharacter(scanner); // the backslash
    quotedCharacter = scannerPeekCharacter(scanner);
    
    if (quotedCharacter == OFCharacterScannerEndOfDataCharacter) {
        return @"";
    } else if (quotedCharacter == '\r' || quotedCharacter == '\n') {
        scannerSkipPeekedCharacter(scanner);
        // CR LF is a single newline in CSS's grammar, so the escape has to swallow both characters; otherwise the LF would end up in the string.
        if (quotedCharacter == '\r' && scannerPeekCharacter(scanner) == '\n')
            scannerSkipPeekedCharacter(scanner);
        if (isInsideQuotes)
            return @"";
        else
            return @"\n";
    } else if (OFCharacterSetHasMember(hexadecimalCharacterSet, quotedCharacter)) {
        unsigned int originalScanLocation;
        unichar hexCharacter; // NOTE(review): code points that don't fit in a unichar are presumably truncated here -- confirm against -scanHexadecimalNumberMaximumDigits:
        
        originalScanLocation = [scanner scanLocation];
        hexCharacter = [scanner scanHexadecimalNumberMaximumDigits:6];
        // A hex escape shorter than six digits may be terminated by one whitespace character, which belongs to the escape
        if (([scanner scanLocation] - originalScanLocation < 6) && OFCharacterSetHasMember(whitespaceCharacterSet, scannerPeekCharacter(scanner)))
            scannerSkipPeekedCharacter(scanner);
        
        return [NSString stringWithCharacter:hexCharacter];
    } else {
        scannerSkipPeekedCharacter(scanner);
        return [NSString stringWithCharacter:quotedCharacter];
    }
}

// Reads a quoted string.  On entry the scanner must be positioned on the opening quote (either ' or ").
// Returns the string's contents with backslash escapes expanded and without the surrounding quotes.
// If the input ends before the closing quote, returns whatever was read up to that point.
- (NSString *)_readQuotedString;
{
    unichar quoteCharacter;
    OFCharacterSet *pauseCharacterSet;
    NSString *buildingString;

    quoteCharacter = scannerPeekCharacter(scanner);
    scannerSkipPeekedCharacter(scanner);
    
    // Stop only at a backslash or at the same kind of quote that opened the string
    if (quoteCharacter == '\'')
        pauseCharacterSet = backslashSingleQuoteValueCharacterSet;
    else
        pauseCharacterSet = backslashQuoteCharacterSet;
    
    buildingString = [self _readEscapableStringWithSpecials:pauseCharacterSet inQuotes:YES];
    
    // Consume the closing quote -- but only if it is really there.  An unterminated string leaves us at the end of the data, and we must not step past that.
    if (scannerPeekCharacter(scanner) == quoteCharacter)
        scannerSkipPeekedCharacter(scanner);
    return buildingString;
}

// Builds a string from the scanner's input, alternating between runs of ordinary characters and backslash escapes.
// Reading stops, without consuming anything further, as soon as the scanner pauses on something other than a backslash.
// readingInsideQuotes is handed to -_readEscapedCharactersInsideOfQuotes: for each escape.
- (NSString *)_readEscapableStringWithSpecials:(OFCharacterSet *)specialCharactersSet inQuotes:(BOOL)readingInsideQuotes
{
    NSString *result = @""; // Foundation returns foo itself for [@"" stringByAppendingString:foo], so starting empty is cheap
    BOOL pausedOnBackslash;

    do {
        NSString *literalRun;

        // Take everything up to the next special character
        literalRun = [scanner readFullTokenWithDelimiterOFCharacterSet:specialCharactersSet];
        if (literalRun != nil)
            result = [result stringByAppendingString:literalRun];

        // A backslash means an escape to expand and more text to follow; anything else is a delimiter and ends the string
        pausedOnBackslash = (scannerPeekCharacter(scanner) == '\\');
        if (pausedOnBackslash)
            result = [result stringByAppendingString:[self _readEscapedCharactersInsideOfQuotes:readingInsideQuotes]];
    } while (pausedOnBackslash);

    return result;
}


@end
