// Copyright 1997-2001 Omni Development, Inc.  All rights reserved.
//
// This software may only be used and reproduced according to the
// terms in the file OmniSourceLicense.html, which should be
// distributed with this project and can also be found at
// http://www.omnigroup.com/DeveloperResources/OmniSourceLicense.html.

#import <OWF/OWUnknownDataStreamProcessor.h>

#import <Foundation/Foundation.h>
#import <OmniBase/OmniBase.h>
#import <OmniFoundation/OmniFoundation.h>

#import <OWF/OWAddress.h>
#import <OWF/OWContentType.h>
#import <OWF/OWDataStream.h>
#import <OWF/OWDataStreamCursor.h>
#import <OWF/OWPipeline.h>
#import <OWF/OWURL.h>

RCS_ID("$Header: /Network/Source/CVS/OmniGroup/Frameworks/OWF/Processors.subproj/OWUnknownDataStreamProcessor.m,v 1.19 2001/08/01 02:36:39 wiml Exp $")

// Private category on OWUnknownDataStreamProcessor.  It currently declares no methods.
@interface OWUnknownDataStreamProcessor (Private)
@end

// Content types looked up once in +initialize:
//   unknownContentType         is "www/unknown"
//   unknownDecodedContentType  is "OWDataStream/UnknownDecodedContent"
static OWContentType *unknownContentType, *unknownDecodedContentType;
// "text/plain" and "application/octet-stream"; the two possible results of the text-vs-binary heuristic in -contentTypeGuessForData:.
static OWContentType *textPlainContentType;
static OWContentType *applicationOctetStreamContentType;
// Maps a signature string to the OWContentType it implies; the signature must appear at the very start of the data.
static NSMutableDictionary *guessPrefixDictionary;
// Maps a signature string to the OWContentType it implies; the signature may appear anywhere in the examined data.
static NSMutableDictionary *guessAnywhereDictionary;

@implementation OWUnknownDataStreamProcessor

// One-time class setup: creates the (initially empty) guess tables and looks up the content types this processor works with.
// +initialize can be invoked more than once (for example on behalf of a subclass that doesn't override it), so a static flag makes sure the setup body runs only the first time.
+ (void)initialize;
{
    static BOOL alreadySetUp = NO;

    [super initialize];
    if (!alreadySetUp) {
        alreadySetUp = YES;

        // Signature tables, filled in later by +registerGuessesDictionary:
        guessPrefixDictionary = [[NSMutableDictionary alloc] initWithCapacity:32];
        guessAnywhereDictionary = [[NSMutableDictionary alloc] initWithCapacity:32];

        // Content types we consume ...
        unknownContentType = [OWContentType contentTypeForString:@"www/unknown"];
        unknownDecodedContentType = [OWContentType contentTypeForString:@"OWDataStream/UnknownDecodedContent"];
        // ... and the fallback types produced by the text-vs-binary heuristic.
        textPlainContentType = [OWContentType contentTypeForString:@"text/plain"];
        applicationOctetStreamContentType = [OWContentType contentTypeForString:@"application/octet-stream"];
    }
}

// Registers this class as a processor from each of the two "unknown" content types to the wildcard content type, at a cost of 1.0.
+ (void)didLoad;
{
    OWContentType *sourceTypes[2];
    unsigned int sourceIndex;

    sourceTypes[0] = unknownContentType;
    sourceTypes[1] = unknownDecodedContentType;
    for (sourceIndex = 0; sourceIndex < 2; sourceIndex++)
        [self registerProcessorClass:self fromContentType:sourceTypes[sourceIndex] toContentType:[OWContentType wildcardContentType] cost:1.0];
}

// Accessor for the shared "www/unknown" content type that +initialize looks up.
+ (OWContentType *)unknownContentType;
{
    OWContentType *sharedUnknownType = unknownContentType;

    return sharedUnknownType;
}

// Records contentType in dictionary under every signature found in guessObject.
//   - NSArray:  each element is processed recursively (so arrays may nest).
//   - NSString: used directly as the key.
//   - NSData:   converted to a string using the Mac OS Roman encoding, then used as the key.
// Anything else (including nil) is silently ignored.
static inline void
readGuessesIntoDictionary(NSMutableDictionary *dictionary, id guessObject, OWContentType *contentType)
{
    if ([guessObject isKindOfClass:[NSArray class]]) {
        NSEnumerator *elementEnumerator = [(NSArray *)guessObject objectEnumerator];
        id element;

        while ((element = [elementEnumerator nextObject]) != nil)
            readGuessesIntoDictionary(dictionary, element, contentType);
        return;
    }

    if ([guessObject isKindOfClass:[NSString class]]) {
        [dictionary setObject:contentType forKey:guessObject];
        return;
    }

    if ([guessObject isKindOfClass:[NSData class]]) {
        NSString *signature;

        signature = [[[NSString alloc] initWithData:guessObject encoding:NSMacOSRomanStringEncoding] autorelease];
        [dictionary setObject:contentType forKey:signature];
    }
}

// Adds content-sniffing signatures to the class-wide guess tables.
//
// guessesDictionary maps a content type string to a dictionary with up to two optional keys:
//   "prefix"   - signature(s) that must appear at the very start of the data
//   "anywhere" - signature(s) that may appear anywhere in the examined data
// Each value may be a string, an NSData, or an array of those (see readGuessesIntoDictionary()).
// Entries whose value is not a dictionary are logged and skipped.
+ (void)registerGuessesDictionary:(NSDictionary *)guessesDictionary;
{
    NSEnumerator *contentTypeEnumerator;
    NSString *contentTypeString;

    contentTypeEnumerator = [guessesDictionary keyEnumerator];
    while ((contentTypeString = [contentTypeEnumerator nextObject])) {
        OWContentType *contentType;
        NSDictionary *guessDictionary;

        contentType = [OWContentType contentTypeForString:contentTypeString];

        // Look the value up by its key rather than walking a second enumerator in lockstep: nothing guarantees that the key and object enumerators agree on ordering.
        guessDictionary = [guessesDictionary objectForKey:contentTypeString];
        if (![guessDictionary isKindOfClass:[NSDictionary class]]) {
            NSLog(@"%@: Ignoring malformed guesses for content type %@", NSStringFromClass(self), contentTypeString);
            continue;
        }

        readGuessesIntoDictionary(guessPrefixDictionary, [guessDictionary objectForKey:@"prefix"], contentType);
        readGuessesIntoDictionary(guessAnywhereDictionary, [guessDictionary objectForKey:@"anywhere"], contentType);
    }
}


//

// Returns the content type stored in guessDictionary under the best key that matches string, or nil if no key matches.
// A key matches if string starts with it (prefixOnly == YES) or contains it (prefixOnly == NO).
// "Best" means the longest matching key (the most specific signature); ties are broken by string comparison so the answer never depends on the dictionary's enumeration order.
static OWContentType *
bestGuessInDictionary(NSDictionary *guessDictionary, NSString *string, BOOL prefixOnly)
{
    NSEnumerator *guessEnumerator;
    NSString *guessString;
    NSString *bestGuessString = nil;

    guessEnumerator = [guessDictionary keyEnumerator];
    while ((guessString = [guessEnumerator nextObject])) {
        BOOL matches;

        if (bestGuessString != nil) {
            // Skip the (comparatively expensive) match test for keys that couldn't replace the current best anyway.
            if ([guessString length] < [bestGuessString length])
                continue;
            if ([guessString length] == [bestGuessString length] && [guessString compare:bestGuessString] != NSOrderedAscending)
                continue;
        }

        if (prefixOnly)
            matches = [string hasPrefix:guessString];
        else
            matches = [string containsString:guessString];
        if (matches)
            bestGuessString = guessString;
    }

    if (bestGuessString == nil)
        return nil;
    return [guessDictionary objectForKey:bestGuessString];
}

// Guesses the content type of a stream from a sample of its leading bytes.  In order, it tries:
//   1. registered signatures that must appear at the start of the data,
//   2. registered signatures that may appear anywhere in the data,
//   3. the pipeline's last address (a guess based on its path), when that address is an OWAddress,
//   4. a text-vs-binary heuristic based on the mix of characters in the data.
// Step 4 always yields either text/plain or application/octet-stream, so the result is never nil.
- (OWContentType *)contentTypeGuessForData:(NSData *)data;
{
    unsigned const char *buffer;
    int length;
    int index;
    int controlCount;
    int textCount;
    int linefeedCount;
    int highCount;
    NSString *string;
    OWContentType *signatureContentType;

    // TODO: Attempt to guess character encoding of text streams?
    // (WIM:) I think this is actually a bad idea. The charset= header is widely understood now, and most places do use it. Pandering to the few places that don't use it will only ensure that every browser ever written will have to ignore the RFCs and run the guessing algorithm. I'd rather not support that.

    // First we try to guess from the first 1000 or so characters of the file.  This catches .jpgs that are really .gifs and vice-versa.
    buffer = [data bytes];
    length = [data length];

    // Currently hardcoded to NSMacOSRomanStringEncoding for Mac OS X.
    string = [[NSString alloc] initWithData:data encoding:NSMacOSRomanStringEncoding];
    signatureContentType = bestGuessInDictionary(guessPrefixDictionary, string, YES);
    if (signatureContentType == nil)
        signatureContentType = bestGuessInDictionary(guessAnywhereDictionary, string, NO);
    [string release];
    if (signatureContentType != nil)
        return signatureContentType;

    // We couldn't guess from the contents, so let's guess based on the filename
    if ([[pipeline lastAddress] isKindOfClass:[OWAddress class]]) {
        OWContentType *guessContentType;

        guessContentType = [(OWAddress *)[pipeline lastAddress] probableContentTypeBasedOnPath];
        // Don't hand a nil guess back to the caller; fall through to the heuristic instead.
        if (guessContentType != nil && guessContentType != unknownContentType)
            return guessContentType;
    }

    // Try a heuristic based on the ratio of text to line feeds (and no control characters).
    textCount = 0;
    controlCount = 0;
    linefeedCount = 0;
    highCount = 0;
    index = length;
    while (index--) {
        unsigned char ch;

        ch = buffer[index];
        switch (ch) {
            case '\n':
                linefeedCount++;
                break;
            case '\r':
            case '\f': // ignore FF
                break;
            case '\t':
                textCount++;
                break;
            default:
                if (ch < 32)
                    controlCount++;
                else if (ch < 128)
                    textCount++;
                else
                    highCount++;
                break;
        }
    }

    // This is the same questionable heuristic that the CERN library uses.
    // Note that data with no control characters at all (including empty data) is always treated as text.
    if (controlCount == 0 || (textCount + linefeedCount >= 16 * (controlCount + highCount)))
        return textPlainContentType;
    else
        return applicationOctetStreamContentType;
}


// Reads up to 1024 bytes from the data cursor, guesses a content type (or content encoding) from them, stamps the guess onto the data stream, and hands the stream back to the pipeline for further processing.
// If this same stream has already been through here once (tracked via a pipeline context object), the stream is typed as application/octet-stream instead.
- (void)process;
{
    NSString *retypeGuardKey = @"OWUnknownDataStreamProcessorContent";
    NSData *sampleData = nil;
    OWDataStream *stream;
    OWContentType *guessedType;

    [self setStatusString:NSLocalizedStringFromTableInBundle(@"Taking a guess at content type", @"OWF", [OWUnknownDataStreamProcessor bundle], unknowndatastreamprocessor status)];

    NS_DURING {
        sampleData = [dataCursor readBytes:1024];
    } NS_HANDLER {
        // An "Underflow" exception is handled by taking whatever data there is; anything else is re-raised.
        if ([[localException name] isEqualToString:@"Underflow"])
            sampleData = [dataCursor readAllData];
        else
            [localException raise];
    } NS_ENDHANDLER;

    stream = [dataCursor dataStream];
    guessedType = [self contentTypeGuessForData:sampleData];

    if (![guessedType isEncoding]) {
        NSLog(@"%@: Guessing content type is %@", [[pipeline lastAddress] addressString], [guessedType contentTypeString]);
        [stream setContentType:guessedType];
    } else {
        // The guess is an encoding rather than a type: record it as the stream's encoding and mark the content type as the "unknown decoded" type.
        NSLog(@"%@: Guessing content encoding is %@", [[pipeline lastAddress] addressString], [guessedType contentTypeString]);
        [stream setContentType:unknownDecodedContentType];
        [stream setContentEncoding:guessedType];
    }

    if ([pipeline contextObjectForKey:retypeGuardKey] == stream) {
        // We've typed this stream once already, time to punt
        [stream setContentType:applicationOctetStreamContentType];
    } else {
        // Let's make sure we don't retype this stream
        [pipeline setContextObject:stream forKey:retypeGuardKey];
    }

    [pipeline addContent:stream];
    [pipeline startProcessingContent];
}


@end
