sbCharsetDetector.cpp
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 :miv */
3 /*
4  *=BEGIN SONGBIRD GPL
5  *
6  * This file is part of the Songbird web player.
7  *
8  * Copyright(c) 2005-2010 POTI, Inc.
9  * http://www.songbirdnest.com
10  *
11  * This file may be licensed under the terms of of the
12  * GNU General Public License Version 2 (the ``GPL'').
13  *
14  * Software distributed under the License is distributed
15  * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
16  * express or implied. See the GPL for the specific language
17  * governing rights and limitations.
18  *
19  * You should have received a copy of the GPL along with this
20  * program. If not, go to http://www.gnu.org/licenses/gpl.html
21  * or write to the Free Software Foundation, Inc.,
22  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23  *
24  *=END SONGBIRD GPL
25  */
26 
27 //------------------------------------------------------------------------------
28 //------------------------------------------------------------------------------
29 //
30 // Songbird charset detect utility.
31 //
32 //------------------------------------------------------------------------------
33 //------------------------------------------------------------------------------
34 
40 //------------------------------------------------------------------------------
41 //
42 // Songbird charset detect utility imported services.
43 //
44 //------------------------------------------------------------------------------
45 
46 // Self imports.
47 #include "sbCharsetDetector.h"
48 
49 // Mozilla imports.
50 #include <nsComponentManagerUtils.h>
51 #include <nsICharsetDetector.h>
52 
53 // Songbird imports.
54 #include <sbStringUtils.h>
55 
56 /* Windows Specific */
57 #if defined(XP_WIN)
58  #include <windows.h>
59 #endif
60 
61 //------------------------------------------------------------------------------
62 //
63 // Songbird charset detect utilities nsISupports implementation.
64 //
65 //------------------------------------------------------------------------------
66 
69  nsICharsetDetectionObserver)
70 
71 
72 //------------------------------------------------------------------------------
73 //
74 // Songbird charset detect utilities sbICharsetDetector implementation.
75 //
76 //------------------------------------------------------------------------------
77 
78 NS_IMETHODIMP
79 sbCharsetDetector::GetIsCharsetFound(PRBool *aIsCharsetFound)
80 {
81  NS_ENSURE_ARG_POINTER(aIsCharsetFound);
82 
83  *aIsCharsetFound = mIsCharsetFound;
84  return NS_OK;
85 }
86 
87 NS_IMETHODIMP
88 sbCharsetDetector::Detect(const nsACString& aStringToDetect)
89 {
90  nsresult rv;
91 
92  // Already have the answer for the charset.
93  if (mIsCharsetFound)
94  return NS_OK;
95 
96  if (!mDetector) {
97  mDetector = do_CreateInstance(
98  NS_CHARSET_DETECTOR_CONTRACTID_BASE "universal_charset_detector");
99 
100  nsCOMPtr<nsICharsetDetectionObserver> observer =
101  do_QueryInterface(NS_ISUPPORTS_CAST(nsICharsetDetectionObserver*, this));
102  NS_ENSURE_TRUE(observer, NS_ERROR_NO_INTERFACE);
103 
104  rv = mDetector->Init(observer);
105  NS_ENSURE_SUCCESS(rv, rv);
106  }
107 
108  // see if it's valid utf8; if yes, assume it _is_ indeed utf8 for now.
109  // Do not set mIsCharsetFound as there could be more incoming data so that
110  // we can get a better answer.
111  nsDependentCString raw(aStringToDetect.BeginReading());
112  if (IsLikelyUTF8(raw) && IsUTF8(raw)) {
113  // this is utf8
114  mDetectedCharset.AssignLiteral("UTF-8");
115  return NS_OK;
116  }
117 
118  // the metadata is in some 8-bit encoding; try to guess
119  rv = RunCharsetDetector(aStringToDetect);
120  if (NS_SUCCEEDED(rv) && !mLastCharset.IsEmpty()) {
121  mDetectedCharset.Assign(mLastCharset);
122  if (eSureAnswer == mLastConfidence || eBestAnswer == mLastConfidence)
123  mIsCharsetFound = PR_TRUE;
124 
125  return NS_OK;
126  }
127 
128 #if XP_WIN
129  // we have no idea what charset this is, but we know it's bad.
130  // for Windows only, assume CP_ACP
131 
132  // make the call fail if it's not valid CP_ACP
133  const char *str = aStringToDetect.BeginReading();
134  int size = MultiByteToWideChar(CP_ACP,
135  MB_ERR_INVALID_CHARS,
136  str,
137  aStringToDetect.Length(),
138  nsnull,
139  0);
140  if (size) {
141  // okay, so CP_ACP is usable
142  mDetectedCharset.AssignLiteral("CP_ACP");
143  return NS_OK;
144  }
145 #endif
146 
147  // we truely know nothing
148  mDetectedCharset.Truncate();
149  return NS_OK;
150 }
151 
152 NS_IMETHODIMP
153 sbCharsetDetector::Finish(nsACString& _retval)
154 {
155  // The charset detection is not done yet. Get the best answer so far.
156  if (!mIsDone) {
157  if (mDetector) {
158  nsresult rv = mDetector->Done();
159  NS_ENSURE_SUCCESS(rv, rv);
160  }
161  if (!mLastCharset.IsEmpty())
162  mDetectedCharset = mLastCharset;
163  }
164 
165  mLastConfidence = eNoAnswerYet;
166  mIsCharsetFound = PR_FALSE;
167  mIsDone = PR_FALSE;
168  // sadly, the detector has no way to unregister a listener; in order to break
169  // the reference cycle between this and mDetector, we need to let it go and
170  // just make a new one the next time we need it.
171  mDetector = nsnull;
172 
173  _retval = mDetectedCharset;
174 
175  return NS_OK;
176 }
177 
178 //------------------------------------------------------------------------------
179 //
180 // Songbird charset detect nsICharsetDetectionObserver implementation.
181 //
182 //------------------------------------------------------------------------------
183 
184 NS_IMETHODIMP
185 sbCharsetDetector::Notify(const char *aCharset, nsDetectionConfident aConf)
186 {
187  mLastCharset.AssignLiteral(aCharset);
188  mLastConfidence = aConf;
189  return NS_OK;
190 }
191 
192 //------------------------------------------------------------------------------
193 //
194 // Songbird charset detect utilities public services.
195 //
196 //------------------------------------------------------------------------------
197 
203 : mLastConfidence(eNoAnswerYet),
204  mIsCharsetFound(PR_FALSE),
205  mIsDone(PR_FALSE)
206 {
207 }
208 
209 
215 {
216 }
217 
218 //------------------------------------------------------------------------------
219 //
220 // Songbird charset detect utilities private methods.
221 //
222 //------------------------------------------------------------------------------
223 
224 nsresult sbCharsetDetector::RunCharsetDetector(const nsACString& aStringToDetect)
225 {
226  NS_ENSURE_TRUE(mDetector, NS_ERROR_NOT_INITIALIZED);
227  nsresult rv = NS_OK;
228 
229  if (NS_SUCCEEDED(rv)) {
230  PRUint32 chunkSize = aStringToDetect.Length();
231  const char *str = aStringToDetect.BeginReading();
232  rv = mDetector->DoIt(str, chunkSize, &mIsDone);
233  NS_ENSURE_SUCCESS(rv, rv);
234  if (mIsDone) {
235  rv = mDetector->Done();
236  NS_ENSURE_SUCCESS(rv, rv);
237  }
238  }
239  return rv;
240 }
Songbird Charset Detector Definitions.
return NS_OK
NS_DECL_ISUPPORTS NS_DECL_SBICHARSETDETECTOR sbCharsetDetector()
PRBool IsUTF8(const nsACString &aString)
NS_IMPL_THREADSAFE_ISUPPORTS2(sbCharsetDetector, sbICharsetDetector, nsICharsetDetectionObserver) NS_IMETHODIMP sbCharsetDetector
A helper class to detect the string charset.
PRBool IsLikelyUTF8(const nsACString &aString)
let observer
NS_IMETHOD Notify(const char *aCharset, nsDetectionConfident aConf)